We need to get this data into a usable state. The raw data is scattered across columns, much as the ancestry and tissue information was, so this script looks pretty similar.

Cleaning up and pre-processing

We begin by taking a quick look at the cell type and tissue descriptors to get some counts. Some of these values clearly do not contain cell type info, but others do, and sorting those out is joyous.

table(allSRAFinal$Cell_type) # Useful
## 
##                                      acute myeloblastic leukemia                            adipose-derived stromal cells (hASCs) 
##                                                                6                                                               54 
##                                  alveolar basal epithelial cells                                          aortic endothelial cell 
##                                                                5                                                                4 
##                                          Aortic enothelial cells                                                          B cells 
##                                                                3                                                               35 
##                                                     B-Lymphocyte                                               BBB microvascaular 
##                                                               30                                                               29 
##                                                 blood leukocytes                                                    breast cancer 
##                                                               12                                                               33 
##                                              Cardiac fibroblasts                                                    cardiomyocyte 
##                                                              166                                                               12 
##                                 Carotid artery endothelial cells                                             CD138-negative cells 
##                                                                6                                                               10 
##                                             CD138-positive cells                                               CD14 Primary Cells 
##                                                               10                                                                1 
##                                      CD14+ peripheral blood cell                                               CD19 Primary Cells 
##                                                                4                                                                1 
##                                                            CD19+                                                CD3 Primary Cells 
##                                                               23                                                                1 
##                                               CD34 Primary Cells                             CD34+ sorted bone marrow progenitors 
##                                                                1                                                               12 
##                                                CD4 Primary Cells                                                       CD4 T cell 
##                                                                2                                                               10 
##                                                      CD4 T cells                                                     CD4+ T cells 
##                                                               79                                                               42 
##                                               CD56 Primary Cells                                      CD56+ peripheral blood cell 
##                                                                1                                                                4 
##                                                CD8 Primary Cells                                                       CD8 T cell 
##                                                                2                                                                8 
##                                                      CD8 T cells                                                      CD8+ T cell 
##                                                               53                                                               22 
##                                                              cDC                                     CELL MIXTURE - tissue sample 
##                                                               29                                                                1 
##                                               classical monocyte                                 CLEAR CELL ADENOCARCINOMA\\, NOS 
##                                                               34                                                                1 
##                                                              cMo                                     cord blood mononuclear cells 
##                                                              110                                                               30 
##                                Coronary artery endothelial cells                                                 cortical neurons 
##                                                                4                                                                4 
##                                                 endocardial cell                                                Endocardial cells 
##                                                                1                                                                3 
##                                      ENDOMETRIOID ADENOCARCINOMA                      endothelial cell of artery (common carotid) 
##                                                                1                                                                8 
##                         endothelial cell of great saphenous vein                             endothelial cell of umbilical artery 
##                                                                3                                                                1 
##                                                Endothelial cells                                                       epithelial 
##                                                               48                                                               67 
##                                                       Epithelial                                                 epithelial cells 
##                                                               25                                                                4 
##                                                  epithelial-like                                            epithelial-like cells 
##                                                                1                                                                4 
##                                                       fibroblast                                                       Fibroblast 
##                                                               38                                                                4 
##                                                      fibroblasts                                                        Firoblast 
##                                                               10                                                               22 
##                              Granule Cell of Human Dentate Gyrus                            great saphenous vein endothelial cell 
##                                                              258                                                                1 
##                                  Hematopoietic Stem Cells (HSCs)                                                       Hepatocyte 
##                                                              730                                                               20 
##                                                          HepLPCs                                                      HepLPCs-Hep 
##                                                                8                                                                2 
##                                   Human Aortic Endothelial Cells                          Human Brachiocephalic Endothelial Cells 
##                                                                6                                                                2 
##                    Human Cardiac Microvascular Endothelial Cells                                      Human Common Carotid Artery 
##                                                                1                                                                1 
##                          Human Coronary Artery Endothelial Cells                                          Human Endocardial Cells 
##                                                                5                                                                1 
##                         Human primary bronchial epithelial cells                         Human Pulmonary Artery Endothelial Cells 
##                                                              100                                                                4 
##                         Human Umbilical Artery Endothelial Cells                           Human Umbilical Vein Endothelial Cells 
##                                                                2                                                                4 
##                                    induced pluripotent stem cell                                   induced pluripotent stem cells 
##                                                              332                                                               18 
##                                                             iPSC                                           iPSC derived cell line 
##                                                               45                                                              250 
##                                  iPSC derived Motor Neuron (iMN)                                      iPSC-derived cardiomyocytes 
##                                                               11                                                               51 
##                                     iPSC-derived cortical neuron                             lung microvascular endothelial cells 
##                                                                6                                                               16 
##                                                      lymphoblast                                                   Lymphoblastoid 
##                                                              159                                                               38 
##                                                M1-polarized HMDM                                                       melanocyte 
##                                                               48                                                                1 
##                                   Microvascular endothelial cell MIXED CLEAR CELL\\, ENDOMETRIOID AND PAPILLARY SEROUS CARCINOMAS 
##                                                                4                                                                1 
##                                                 mononuclear cell                                                            monos 
##                                                               20                                                              198 
##                                                     Naive B cell                                                 Nasal epithelium 
##                                                               21                                                               18 
##                                             Natural Killer Cells                                                       Neutrophil 
##                                                               12                                                               18 
##                                                            neuts                                                    NORMAL TISSUE 
##                                                              201                                                                8 
##                                                         organoid                                  PAPILLARY SEROUS ADENOCARCINOMA 
##                                                               39                                                                3 
##                 PAPILLARY SEROUS AND ENDOMETRIOID ADENOCARCINOMA                                       PAPILLARY SEROUS CARCINOMA 
##                                                                1                                                                8 
##                              PAPILLARY SEROUS CYSTADENOCARCINOMA                                                             PBMC 
##                                                                4                                                              209 
##                                   PBMC derived macrophage (HMDM)                                                            PBMCs 
##                                                               48                                                               12 
##                                                              PD1                                                              pDC 
##                                                               74                                                               30 
##                               peripheral blood mononuclear cells                               Peripheral blood mononuclear cells 
##                                                               24                                                               54 
##                                                         Platelet                                                              PMN 
##                                                               99                                                               61 
##                                                          primary                                                     primary cell 
##                                                               16                                                               12 
##                                                     Primary cell                                              Primary hepatocytes 
##                                                               32                                                                2 
##                             Primary vascular smooth muscle cells                                                         prostate 
##                                                                8                                                               12 
##                                                  prostate cancer                               Pulmonary artery endothelial cells 
##                                                               24                                                                2 
##                                   Renal artery endothelial cells                                            SEROUS ADENOCARCINOMA 
##                                                                7                                                                2 
##                                                 SEROUS CARCINOMA                               SEROUS SURFACE PAPILLARY CARCINOMA 
##                                                                4                                                                3 
##                            Small Airway Epithelial Cells (SAECs)                                    small airway epithelium (SAE) 
##                                                               36                                                               12 
##                                                        Stem Cell                             superior vena cava endothelial cells 
##                                                                8                                                                1 
##                                                           T cell                                                          T cells 
##                                                               18                                                               33 
##                                                              Tfh                                                             Treg 
##                                                               74                                                               74 
##                               Umbilical artery endothelial cells                                 Umbilical vein endothelial cells 
##                                                                2                                                                3 
##                                                White blood cells                                                      Whole blood 
##                                                               67                                                               28
table(allSRAFinal$Tissue) # Useful
## 
##                                                                           AC                                                                   ACD plasma 
##                                                                           18                                                                           12 
##                                                 Adenocarcinoma of the kidney                                        Adenocarcinoma of the large intestine 
##                                                                            1                                                                            2 
##                                                   Adenocarcinoma of the lung                                                   Adenocarcinoma\\, invasive 
##                                                                            2                                                                            2 
##                                       Adenocarcinoma\\, invasive of the lung                                              Adenocarcinoma\\, mixed subtype 
##                                                                            1                                                                            1 
##                                      Adenocarcinoma\\, papillary predominant                                                                      adipose 
##                                                                            1                                                                           50 
##                                                              Adjacent normal                                                       Adjacent normal tissue 
##                                                                            4                                                                           14 
##                                                                      Adrenal                                                              adult cartilage 
##                                                                            1                                                                            3 
##                                                                        Aorta                                                        Apical left ventricle 
##                                                                           51                                                                           32 
##                                                                Atrial Tissue                                                            autonomic_ganglia 
##                                                                           24                                                                           16 
##                                                               Base of Tongue                                                                biliary_tract 
##                                                                           10                                                                            8 
##                                                                          BLA                                                                      bladder 
##                                                                           60                                                                            1 
##                                                               Bladder cancer                                                                        blood 
##                                                                            5                                                                         4873 
##                                                                        Blood                             blood plasma enriched for extracellular vesicles 
##                                                                         1287                                                                           82 
##                                                                         bone                                                                  bone marrow 
##                                                                           26                                                                           33 
##                                                                  Bone marrow                                                                  Bone Marrow 
##                                                                           47                                                                           42 
##                                                                        brain                                                                        Brain 
##                                                                          288                                                                          135 
##                                                       Brain – choroid plexus                                                                Brain (DLPFC) 
##                                                                           15                                                                           32 
##                                                            Brain capillaries                                                                       breast 
##                                                                           29                                                                          137 
##                                                                       Breast                                                         breast cancer tissue 
##                                                                            9                                                                           82 
##                                                             Brodmann Area 38                                                                Cancer tissue 
##                                                                           53                                                                           14 
##                                                           Carotid body tumor                                                                      Caudate 
##                                                                           24                                                                          131 
##                                                                           CE                                                                        cecum 
##                                                                           60                                                                            2 
##                                                       central_nervous_system                                                            CHD whole blood 1 
##                                                                           64                                                                            1 
##                                                            CHD whole blood 2                                                            CHD whole blood 3 
##                                                                            1                                                                            1 
##                                                            CHD whole blood 4                                                            CHD whole blood 5 
##                                                                            1                                                                            1 
##                                                            CHD whole blood 6                                      Clear cell adenocarcinoma of the kidney 
##                                                                            1                                                                            3 
##                                                                        colon                                                                 Colon biopsy 
##                                                                          132                                                                           16 
##                                                            colorectal cancer                                                        Common carotid artery 
##                                                                           39                                                                            6 
##                                                                   Connective                                                            Connective tissue 
##                                                                           22                                                                           18 
##                                                              Coronary artery                                                                           DC 
##                                                                            4                                                                            4 
##                                                                 distal ileum                                                                        DLPFC 
##                                                                            1                                                                           23 
##                                               Dorsolateral prefrontal cortex                                                                  EDTA plasma 
##                                                                           80                                                                           12 
##                                                                  Endocardium                                                           endometrial tissue 
##                                                                            3                                                                           10 
##                                                                  endometrium                                                                            F 
##                                                                           28                                                                            6 
##                                                               FALLOPIAN TUBE                                                                          Fat 
##                                                                            8                                                                           54 
##                                                                 femoral head                                                                   fibroblast 
##                                                                            4                                                                            8 
##                                                               Floor of Mouth                                                                  foetal lung 
##                                                                            1                                                                           12 
##                                                                     Foreskin                                                             Gastric (Benign) 
##                                                                            4                                                                            2 
##                                                             Gastric (Normal)                                                              Gastric (Tumor) 
##                                                                            3                                                                           36 
##                                                                     gingival                                                         Great Saphenous vein 
##                                                                           12                                                                            3 
##                                           haematopoietic_and_lymphoid_tissue                                                             HC whole blood 1 
##                                                                          178                                                                            1 
##                                                             HC whole blood 2                                                             HC whole blood 3 
##                                                                            1                                                                            1 
##                                                             HC whole blood 4                                                             HC whole blood 5 
##                                                                            1                                                                            1 
##                                                             HC whole blood 6                                                      Heart\\, left ventricle 
##                                                                            1                                                                            1 
##                                                         Hematopoietic tissue                                                               Heparin plasma 
##                                                                           20                                                                           44 
##                                        Hepatocellular carcinoma of the liver                                              Hepatocellular carcinoma\\, NOS 
##                                                                            1                                                                            2 
##                                                                           HF                                   http://purl.obolibrary.org/obo/BTO_0000784 
##                                                                            2                                                                           20 
##                                                            Human Hippocampus                                                                  Hyppocampus 
##                                                                          258                                                                           12 
##                                                                          ICV                                                                          Ile 
##                                                                           59                                                                            2 
##                                                                 Ileal biopsy                                                                  Ileal pouch 
##                                                                          245                                                                           39 
##                                                                        Ileum Induced pluripotent stem cell line derived from adult skin (leg) fibroblasts 
##                                                                          195                                                                           18 
##                       induced pluripotent stem cell-derived cortical neurons                                                                         iPSC 
##                                                                           22                                                                           24 
##                  iPSC_CRISPR_correcte-derived cortical neuron\\, replicate 1                  iPSC_CRISPR_correcte-derived cortical neuron\\, replicate 2 
##                                                                            1                                                                            1 
##                  iPSC_CRISPR_correcte-derived cortical neuron\\, replicate 3                                         iPSC_CRISPR_corrected\\, replicate 1 
##                                                                            1                                                                            1 
##                                         iPSC_CRISPR_corrected\\, replicate 2                                         iPSC_CRISPR_corrected\\, replicate 3 
##                                                                            1                                                                            1 
##                                                           iPSC-derived cells                                  iPSC-derived cortical neuron\\, replicate 1 
##                                                                          304                                                                            1 
##                                  iPSC-derived cortical neuron\\, replicate 2                                  iPSC-derived cortical neuron\\, replicate 3 
##                                                                            1                                                                            1 
##                                                          iPSC\\, replicate 1                                                          iPSC\\, replicate 2 
##                                                                            1                                                                            1 
##                                                          iPSC\\, replicate 3                                                                       kidney 
##                                                                            1                                                                           33 
##                                                                Kidney cortex                                                               Kidney medulla 
##                                                                            1                                                                            1 
##                                                              large_intestine                                                                       Larynx 
##                                                                           58                                                                            1 
##                                                                  Left Atrium                                                                    Left Lung 
##                                                                            8                                                                           60 
##                                                               Left Ventricle                                                                leukapheresis 
##                                                                            8                                                                            1 
##                                                                        liver                                                                        Liver 
##                                                                           71                                                                          229 
##                                                          Liver\\, Epithelial                                                                           LN 
##                                                                           12                                                                           66 
##                                                                         lung                                                                         Lung 
##                                                                          198                                                                           67 
##                                          Lung (Endotracheal aspirate\\, ETA)                               lung; derived from metastatic site: lymph node 
##                                                                            4                                                                            6 
##                                                                   lymph node                                                                  Lymphocytes 
##                                                                            2                                                                           18 
##                                                                            M  mammary gland/breast cancer; derived from metastatic site: pleural effusion 
##                                                                            2                                                                           21 
##         mammary gland/breast; derived from metastatic site: pleural effusion      mammary gland\\, breast; derived from metastatic site: pleural effusion 
##                                                                            1                                                                           24 
##                                                         Maternal Whole Blood                                                                  mediastinum 
##                                                                          238                                                                            2 
##                                                                     melanoma                                                    metastatic ovarian cancer 
##                                                                            1                                                                          129 
##                                                                      missing                                                   myocardium (atrial biopsy) 
##                                                                            4                                                                           53 
##                                                                          Nac                                                                          NAC 
##                                                                          130                                                                           58 
##                                                             Nasal epithelium                                     Nasal olfactory mucosa: middle turbinate 
##                                                                           18                                                                           17 
##                                                                    Nasal RNA                                                    Normal bladder epithelium 
##                                                                           65                                                                            5 
##                                                         normal breast tissue                                                              normal prostate 
##                                                                            5                                                                           16 
##                                                                   oesophagus                                                                      OMENTUM 
##                                                                           25                                                                            9 
##                     Organotypic raft cultures (ORC) containing keratinocytes                                                 ovarian clear cell carcinoma 
##                                                                           23                                                                            9 
##                                                                        ovary                                                                        Ovary 
##                                                                           48                                                                           10 
##                                                                        OVARY                                                     ovary cystadenocarcinoma 
##                                                                           10                                                                            9 
##                                                                  OVARY TUMOR                                                               ovary: ascites 
##                                                                            8                                                                           21 
##                                                                     pancreas                                                                pancreas/duct 
##                                                                           71                                                                           16 
##                                                                  parathyroid                                                                         PBMC 
##                                                                           15                                                                          197 
##                                                             peripheral blood                                                             Peripheral blood 
##                                                                          771                                                                          194 
##                                           Peripheral blood mononuclear cells                                                            Peripheral Retina 
##                                                                           20                                                                           16 
##                                                Peripheral RPE-Choroid-Sclera                                                      Peripheral venous blood 
##                                                                           15                                                                           20 
##                                                                   PERITONEUM                                                                          PFC 
##                                                                            1                                                                           60 
##                                                            pheripheral blood                                                                     Placenta 
##                                                                           18                                                                           32 
##                                                               Plasma exosome                                                                       pleura 
##                                                                            6                                                                           11 
##                   Postmortem brain\\, dorsolateral prefrontal cortex (DLPFC)                                Postmortem Cervical Spinal Section Homogenate 
##                                                                           52                                                                           15 
##                                                                    Precuneus                                                            prefrontal cortex 
##                                                                           10                                                                           36 
##                                                               Prepouch ileum                                                        primary breast tumour 
##                                                                           36                                                                           18 
##                                                         Primary Heart Tissue                                               Primary nasal epithelial cells 
##                                                                           22                                                                            8 
##                                                     primary skin fibroblasts                                                                primary tumor 
##                                                                           24                                                                           36 
##                   primary tumor tissue from high grade serous ovarian tumors                                                        Primary visual cortex 
##                                                                           72                                                                           10 
##                                                                     prostate                                                              prostate cancer 
##                                                                           83                                                                          114 
##                                       prostate gland (matched normal tissue)                                                               prostate tumor 
##                                                                           28                                                                           16 
##                                                                     Pterygia                                                             Pulmonary artery 
##                                                                           12                                                                            2 
##                                                           Purified platelets                                                                      Putamen 
##                                                                            1                                                                          131 
##                                                                          rec                                                                          Rec 
##                                                                            2                                                                           26 
##                                                                       rectum                                                                 Renal artery 
##                                                                           76                                                                            7 
##                                                         Renal cell carcinoma                                                                       retina 
##                                                                           20                                                                           16 
##                                                                 Right Atrium                                                                   Right Lung 
##                                                                            8                                                                           40 
##                                                               salivary_gland                                                                           SC 
##                                                                            2                                                                            9 
##                                                                        Serum                                                                sigmoid colon 
##                                                                           12                                                                           25 
##                                                              Skeletal muscle                                                                         skin 
##                                                                           16                                                                          182 
##                                                                         Skin                                                                Skin biopsies 
##                                                                           83                                                                           36 
##                                                                  skin tissue                                                              small_intestine 
##                                                                            6                                                                            1 
##                                                                  soft_tissue                                                         Stem cell population 
##                                                                           19                                                                            8 
##                                                                      stomach                                                            Substanstia nigra 
##                                                                           63                                                                           34 
##                                                                           TC                                                                       testes 
##                                                                            2                                                                            1 
##                                                                       testis                                                                       Testis 
##                                                                            1                                                                           14 
##                                                                      thyroid                                                                      Thyroid 
##                                                                           12                                                                           19 
##                                                                           TI                                                                       Tongue 
##                                                                            2                                                                            6 
##                                                                       Tonsil                                                             Trachea/Bronchus 
##                                                                            6                                                                           12 
##                                   triple negative breast cancer tumor (FFPE)                                                                        tumor 
##                                                                           42                                                                           15 
##                                                                        Tumor                                                                 Tumor tissue 
##                                                                            4                                                                           15 
##                                  Tumor\\, unspecified of the large intestine                                                             Umbilical artery 
##                                                                            1                                                                            2 
##                                                               Umbilical vein                                                    upper_aerodigestive_tract 
##                                                                            3                                                                           32 
##                                                                urinary_tract                                                 uterine clear cell carcinoma 
##                                                                           26                                                                            4 
##                                                                       uterus                                                                       Uterus 
##                                                                           36                                                                            2 
##                                                                      vaginal                                                      vastus lateralis muscle 
##                                                                           16                                                                          132 
##                                             vastus lateralis skeletal muscle                                                                       vessel 
##                                                                          671                                                                           20 
##                                                                  whole blood                                                                  Whole blood 
##                                                                          445                                                                          348 
##                                                                  Whole Blood                                                                   Whole skin 
##                                                                           56                                                                           16
table(allSRAFinal$tissue_type) # Useful: clean tissue names, but only a few samples carry a value (table() skips NAs by default)
## 
##                          Adipose                   Adipose Tissue                    Adrenal Gland                            Aorta                          Bladder 
##                                2                                1                                2                                2                                1 
##                        Esophagus                          Gastric                   Left Ventricle                            Liver                             Lung 
##                                2                                3                                2                                1                                2 
##                            Ovary                         Pancreas                     Psoas Muscle                     Right Atrium                  Right Ventricle 
##                                1                                2                                3                                1                                2 
##                    Sigmoid Colon                  Small Intestine                           Spleen the endothelium of blood vessels                           Thymus 
##                                3                                3                                3                               26                                1
table(allSRAFinal$Sample_type) # Somewhat useful: mostly sample-prep categories (e.g. cell culture), with only occasional tissue/cell hints
## 
##                 blood tissue sample                        cell culture                        Cell culture Cell Culture Transcriptome Sequence                    cortical neurons 
##                                  16                                1036                                  24                                  29                                   4 
##                              frozen               hiPSC-derived neurons                           leukocyte                 liquid cell culture      Plasma sample\\, platelet-poor 
##                                   3                                  28                                  12                                   8                                  68 
##                        Primary Cell                        Serum sample                       the cirrhotic                   the liver tissues                              Tissue 
##                                  42                                  12                                   9                                   9                                   6 
##                       tissue sample 
##                                  15
table(allSRAFinal$Sample.Name) # Nope: comes back as an empty table, i.e. the column is absent or has no non-NA values
## < table of extent 0 >
table(allSRAFinal$source_name) # Useful, but free text: tissue/cell descriptors are mixed with condition and time-point labels (e.g. "Active_Wk 26")
## 
##                                                                                 12-cell blastomere 
##                                                                                                  6 
##                                                                                 16-cell blastomere 
##                                                                                                  7 
##                                                                                  4 mm punch biopsy 
##                                                                                                 16 
##                                                                                  4-cell blastomere 
##                                                                                                  5 
##                                                                                  8-cell blastomere 
##                                                                                                 23 
##                                                                                    Active_Baseline 
##                                                                                                 29 
##                                                                                         Active_M30 
##                                                                                                  2 
##                                                                                      Active_Wk 104 
##                                                                                                 18 
##                                                                                Active_Wk 104 (130) 
##                                                                                                  1 
##                                                                             Active_Wk 104 (or M30) 
##                                                                                                  1 
##                                                                                       Active_Wk 26 
##                                                                                                 28 
##                                                                                       Active_Wk 52 
##                                                                                                 30 
##                                                                                       Active_Wk 78 
##                                                                                                 26 
##                                                                                            adipose 
##                                                                                                 50 
##                                                                                    Adjacent normal 
##                                                                                                  4 
##                                                                        Aged A_peripheral blood HSC 
##                                                                                                 94 
##                                                                        Aged B_peripheral blood HSC 
##                                                                                                 83 
##                                                                        Aged C_peripheral blood HSC 
##                                                                                                 94 
##                                                                        Aged D_peripheral blood HSC 
##                                                                                                 94 
##                                                                       Air Liquid Interface culture 
##                                                                                                 36 
##                                                                              AnCg_Bipolar Disorder 
##                                                                                                 24 
##                                                                                       AnCg_Control 
##                                                                                                 24 
##                                                                              AnCg_Major Depression 
##                                                                                                 24 
##                                                                                 AnCg_Schizophrenia 
##                                                                                                 24 
##                                   articular cartilage of the femoral condyle and/or tibial plateau 
##                                                                                                  3 
##                                                                                           ASC_Pop2 
##                                                                                                 11 
##                                                                                           ASC_Pop3 
##                                                                                                 11 
##                                                                                           ASC_PopA 
##                                                                                                 10 
##                                                                                           ASC_PopB 
##                                                                                                 11 
##                                                                                           ASC_PopD 
##                                                                                                 10 
##                                                                                                  B 
##                                                                                                 18 
##                                                                                       B-Lymphocyte 
##                                                                                                108 
##                                                                                              Bcell 
##                                                                                                 28 
##                                                                                              blood 
##                                                                                                350 
##                                                                                              Blood 
##                                                                                               1386 
##                                                                                      Blood\\, case 
##                                                                                                110 
##                                                                                   Blood\\, control 
##                                                                                                245 
##                                                                                  bone tissue cells 
##                                                                                                  4 
##                                                                                              Brain 
##                                                                                                 89 
##                                                                                             Breast 
##                                                                                                196 
##                                                                                      Breast tissue 
##                                                                                                 60 
##                                                                                   Brodmann Area 38 
##                                                                                                 53 
##                                                                             bronchoalveolar lavage 
##                                                                                                100 
##                                                                                            Caudate 
##                                                                                                 72 
##                                                                                                CD4 
##                                                                                                 56 
##                                                                                                CD8 
##                                                                                                 54 
##                                                                                CD8+ sorted T-cells 
##                                                                                                 63 
##                                                                        CKD_vastus lateralis muscle 
##                                                                                                  6 
##                                                                                   Clear cell tumor 
##                                                                                                 13 
##                                                                                              Colon 
##                                                                                                 53 
##                                                                                            control 
##                                                                                                  8 
##                                                                                      Control_Ileum 
##                                                                                                 37 
##                                                                    Control_vastus lateralis muscle 
##                                                                                                  3 
##                                                                      Control\\, classical monocyte 
##                                                                                                 18 
##                                                                 coronary artery calcification case 
##                                                                                                  8 
##                                                                                     COVID-19 blood 
##                                                                                                  4 
##                                                                                        COVID-19 TA 
##                                                                                                  4 
##                                                                       Crohn's disease_Ileal biopsy 
##                                                                                                210 
##                                                                              Crohn's Disease_Ileum 
##                                                                                                112 
##                                                                   Dialysis_vastus lateralis muscle 
##                                                                                                  4 
##                                                                             DLPFC_Bipolar Disorder 
##                                                                                                 23 
##                                                                                      DLPFC_Control 
##                                                                                                 24 
##                                                                             DLPFC_Major Depression 
##                                                                                                 23 
##                                                                                DLPFC_Schizophrenia 
##                                                                                                 24 
##                                                                                Donor_1 HepLPCs-Hep 
##                                                                                                  1 
##                                                                                       Donor_1 PHCs 
##                                                                                                  1 
##                                                                                  Donor_1 TEM Day10 
##                                                                                                  1 
##                                                                                   Donor_1 TEM Day4 
##                                                                                                  1 
##                                                                                    Donor_1 TEM P10 
##                                                                                                  1 
##                                                                                     Donor_1 TEM P5 
##                                                                                                  1 
##                                                                                       Donor_2 PHCs 
##                                                                                                  1 
##                                                                                  Donor_2 TEM Day10 
##                                                                                                  1 
##                                                                                   Donor_2 TEM Day4 
##                                                                                                  1 
##                                                                                    Donor_2 TEM P10 
##                                                                                                  1 
##                                                                                     Donor_2 TEM P5 
##                                                                                                  1 
##                                                                                Donor_3 HepLPCs-Hep 
##                                                                                                  1 
##                                                                                           Duodenum 
##                                                                                                 32 
##                                                                                     FALLOPIAN TUBE 
##                                                                                                  8 
##                                                                                         Fibroblast 
##                                                                                                 32 
##                                                                           fibroblast derived iPSCs 
##                                                                                                  6 
##                                                                  frontal cortex of the human brain 
##                                                                                                 34 
##                                                                full-thickness sigmoid colon tissue 
##                                                                                                 25 
##                                                                                       Glioblastoma 
##                                                                                                 12 
##                                                                                           HC blood 
##                                                                                                 17 
##                                                                                    Healthy control 
##                                                                                                 11 
##                                                                              Healthy control blood 
##                                                                                                  6 
##                                                                    Healthy donor CD34+ Bone Marrow 
##                                                                                                 12 
##                                                                                       healthy skin 
##                                                                                                  1 
##                                                                                              Heart 
##                                                                                                 71 
##                                                                                           hematoma 
##                                                                                                178 
##                                                                                    Huh-7 Cell Line 
##                                                                                                 12 
##                                                                                         HuH7 1D142 
##                                                                                                  3 
##                                                                   Human Aortic smooth muscle cells 
##                                                                                                  8 
##                                                                    Human colorectal adenocarcinoma 
##                                                                                                  6 
##                                                                   human Dorsal Root Ganglia (hDRG) 
##                                                                                                  1 
##                                                               human induced pluripotent stem cells 
##                                                                                                317 
##                                                                           human omental metastasis 
##                                                                                                129 
##                                                                      Human postmortem brain sample 
##                                                                                                 34 
##                                                               Human skin microvascular endothelium 
##                                                                                                  4 
##                                                                                          IBS_Ileum 
##                                                                                                  1 
##                                                              Ideopathic pulmonary disease\\, PBMCs 
##                                                                                                 54 
##                                                                                              Ileum 
##                                                                                                 47 
##                                                                                             Ileum1 
##                                                                                                  1 
##                                                                                             Ileum2 
##                                                                                                  1 
##                                                               In vitro differentiated plasma cells 
##                                                                                                 53 
##                                               induced pluripotent stem cell-derived cardiomyocytes 
##                                                                                                 44 
##                                             induced pluripotent stem cell-derived cortical neurons 
##                                                                                                 22 
##                                                                     induced pluripotent stem cells 
##                                                                                                 18 
##                                                                                    inner cell mass 
##                                                                                                 13 
##                                                                 Insulin resistant_iPSC_no treament 
##                                                                                                  6 
##                                                                     Insulin resistant_iPSC_treated 
##                                                                                                  6 
##                                                                 Insulin sensitive_iPSC_no treament 
##                                                                                                  6 
##                                                                     Insulin sensitive_iPSC_treated 
##                                                                                                  6 
##                                                                                               iPSC 
##                                                                                                 11 
##                                                                 iPSC-derived cardiomyocytes day 15 
##                                                                                                 16 
##                                                  iPSC-derived cardiomyocytes day 27 Not T3 treated 
##                                                                                                  8 
##                                                      iPSC-derived cardiomyocytes day 27 T3 treated 
##                                                                                                 27 
##                                                                                             keloid 
##                                                                                                  4 
##                                                                                          Left Lung 
##                                                                                                 60 
##                                                                                     Left Ventricle 
##                                                                                                361 
##                                                                         Left Ventricle Endocardium 
##                                                                                                 11 
##                                                                                Leicester_Active_TB 
##                                                                                                 53 
##                                                                                  Leicester_Control 
##                                                                                                 50 
##                                                                                     Leicester_LTBI 
##                                                                                                 49 
##                                                                                              Liver 
##                                                                                                 16 
##                                                                                                 LN 
##                                                                                                 66 
##                                                       Longitudnal_Leicester_Control_Non_progressor 
##                                                                                                 69 
##                                                          Longitudnal_Leicester_LTBI_Non_progressor 
##                                                                                                 69 
##                                                              Longitudnal_Leicester_LTBI_Progressor 
##                                                                                                 23 
##                                                                                     Lower leg skin 
##                                                                                                 20 
##                                                                                               Lung 
##                                                                                                 66 
##                                                                lung microvascular endothelial cell 
##                                                                                                 16 
##                                                                                        Lymphocytes 
##                                                                                                 55 
##                                                                                         Macrophage 
##                                                                                                 96 
##                                                                       Maternal Whole Blood_Chinese 
##                                                                                                146 
##                                                                        Maternal Whole Blood_Indian 
##                                                                                                 44 
##                                                                         Maternal Whole Blood_Malay 
##                                                                                                 48 
##                                                                                   metastatic tumor 
##                                                                                                 11 
##                                                                                           monocyte 
##                                                                                                 28 
##                                                                                          Monocytes 
##                                                                                                 18 
##                                                                                     Muscle (ileum) 
##                                                                                                  3 
##                                                                                       Muscle-Colon 
##                                                                                                  3 
##                                                                                        Muscle-Duod 
##                                                                                                  3 
##                                                                                       Muscle-Ileum 
##                                                                                                  3 
##                                                                                                Nac 
##                                                                                                 71 
##                                                                NAc\\, Caudate\\, and Putamen_pair1 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair103 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair105 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair107 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair109 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair11 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair111 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair115 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair117 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair119 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair129 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair133 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair139 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair145 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair149 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair15 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair153 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair157 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair159 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair163 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair167 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair171 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair173 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair177 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair183 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair187 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair19 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair193 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair195 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair197 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair201 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair205 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair207 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair21 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair211 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair213 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair217 
##                                                                                                  3 
##                                                              NAc\\, Caudate\\, and Putamen_pair219 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair25 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair27 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair29 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair37 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair39 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair41 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair43 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair53 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair59 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair63 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair65 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair67 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair69 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair73 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair77 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair81 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair83 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair87 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair89 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair95 
##                                                                                                  3 
##                                                               NAc\\, Caudate\\, and Putamen_pair99 
##                                                                                                  3 
##                                                                              nAcc_Bipolar Disorder 
##                                                                                                 24 
##                                                                                       nAcc_Control 
##                                                                                                 22 
##                                                                              nAcc_Major Depression 
##                                                                                                 22 
##                                                                                 nAcc_Schizophrenia 
##                                                                                                 23 
##                                                                                       Naïve B cell 
##                                                                                                 21 
##                                                                                          Nasal RNA 
##                                                                                                 65 
##                                                                                       nasal_tissue 
##                                                                                                  6 
##                                                            Neurons isolated from post-mortem dlPFC 
##                                                                                                 36 
##                                                                                         Neutrophil 
##                                                                                                 18 
##                                                                       Non-IBD control_Ileal biopsy 
##                                                                                                 35 
##                                                                                        normal scar 
##                                                                                                  3 
##                                                                   normal tissue matched to tumor 1 
##                                                                                                  1 
##                                                                  normal tissue matched to tumor 10 
##                                                                                                  1 
##                                                                  normal tissue matched to tumor 11 
##                                                                                                  1 
##                                                                  normal tissue matched to tumor 12 
##                                                                                                  1 
##                                                                  normal tissue matched to tumor 13 
##                                                                                                  1 
##                                                                  normal tissue matched to tumor 14 
##                                                                                                  1 
##                                                                  normal tissue matched to tumor 15 
##                                                                                                  1 
##                                                                  normal tissue matched to tumor 16 
##                                                                                                  1 
##                                                                   normal tissue matched to tumor 2 
##                                                                                                  1 
##                                                                   normal tissue matched to tumor 3 
##                                                                                                  1 
##                                                                   normal tissue matched to tumor 4 
##                                                                                                  1 
##                                                                   normal tissue matched to tumor 5 
##                                                                                                  1 
##                                                                   normal tissue matched to tumor 6 
##                                                                                                  1 
##                                                                   normal tissue matched to tumor 7 
##                                                                                                  1 
##                                                                   normal tissue matched to tumor 8 
##                                                                                                  1 
##                                                                   normal tissue matched to tumor 9 
##                                                                                                  1 
##                                                       normal-weight child with asthma_CD4+ T cells 
##                                                                                                 21 
##                                                               obese child with asthma_CD4+ T cells 
##                                                                                                 21 
##                                                                                            OMENTUM 
##                                                                                                  9 
##                                                                                        oral mucosa 
##                                                                                                 66 
##                                                                          Organotypic raft cultures 
##                                                                                                 23 
##                                                                                              OVARY 
##                                                                                                 10 
##                                                                                        OVARY TUMOR 
##                                                                                                  8 
##                                                                                              Overy 
##                                                                                                 10 
##                                                                                             P427.5 
##                                                                                                  2 
##                                                                                             P427.6 
##                                                                                                  1 
##                                                                                             P427.7 
##                                                                                                  2 
##                                                                                            P500.12 
##                                                                                                  2 
##                                                                                             P500.5 
##                                                                                                  2 
##                                                                                             P500.8 
##                                                                                                  2 
##                                                                                             P501.5 
##                                                                                                  2 
##                                                                                             P501.6 
##                                                                                                  2 
##                                                                                             P501.7 
##                                                                                                  2 
##                                                                                            P502.12 
##                                                                                                  3 
##                                                                                             P502.4 
##                                                                                                  2 
##                                                                                             P502.8 
##                                                                                                  2 
##                                                                                             P503.5 
##                                                                                                  1 
##                                                                                             P503.6 
##                                                                                                  2 
##                                                                                             P503.7 
##                                                                                                  2 
##                                                                                             P505.5 
##                                                                                                  1 
##                                                                                             P505.6 
##                                                                                                  2 
##                                                                                             P505.7 
##                                                                                                  2 
##                                                                                             P506.5 
##                                                                                                  2 
##                                                                                             P506.6 
##                                                                                                  1 
##                                                                                             P506.7 
##                                                                                                  2 
##                                                                                             P507.5 
##                                                                                                  2 
##                                                                                             P508.5 
##                                                                                                  2 
##                                                                                             P508.6 
##                                                                                                  2 
##                                                                                             P509.5 
##                                                                                                  2 
##                                                                                             P510.5 
##                                                                                                  2 
##                                                                                             P511.5 
##                                                                                                  2 
##                                                                                             P511.6 
##                                                                                                  2 
##                                                                                             P512.5 
##                                                                                                  1 
##                                                                                             P513.5 
##                                                                                                  2 
##                                                                                             P600.5 
##                                                                                                  2 
##                                                                                             P600.6 
##                                                                                                  2 
##                                                                                               P601 
##                                                                                                  2 
##                                                                                               P602 
##                                                                                                  4 
##                                                                                               P603 
##                                                                                                  2 
##                                                                                             P604.6 
##                                                                                                  2 
##                                                                                             P604.7 
##                                                                                                  1 
##                                                                                             P607.5 
##                                                                                                  2 
##                                                                                               P701 
##                                                                                                  2 
##                                                                                      PAXgene blood 
##                                                                                               1857 
##                                                                                               PBMC 
##                                                                                                346 
##                                                                                              PBMCs 
##                                                                                                280 
##                                                                        Peripheral blood leukocytes 
##                                                                                                  8 
##                                                                 peripheral blood mononuclear cells 
##                                                                                                 24 
##                                                                                  Peripheral Retina 
##                                                                                                 16 
##                                                                      Peripheral RPE-Choroid-Sclera 
##                                                                                                 15 
##                                                                            Peripheral venous blood 
##                                                                                                 20 
##                                                                                         PERITONEUM 
##                                                                                                  1 
##                                                                                    Pituitary gland 
##                                                                                                  7 
##                                                                                   Placebo_Baseline 
##                                                                                                 13 
##                                                                                        Placebo_M30 
##                                                                                                  2 
##                                                                                     Placebo_Wk 104 
##                                                                                                  8 
##                                                                                      Placebo_Wk 26 
##                                                                                                 10 
##                                                                                      Placebo_Wk 52 
##                                                                                                 12 
##                                                                                      Placebo_Wk 78 
##                                                                                                 15 
##                                                          Post-mortem dosolateral prefrontal cortex 
##                                                                                                 34 
##                                                                          Postmortem Brain\\, DLPFC 
##                                                                                                 52 
##                                                                               Primary Heart Tissue 
##                                                                                                 22 
##                                                                       primary ovarian tumor tissue 
##                                                                                                 72 
##                                                                                      primary tumor 
##                                                                                                550 
##                                                                               Prostate Core Biopsy 
##                                                                                                 32 
##                                                                                          Pterygium 
##                                                                                                 12 
##                                                                              Pulmonary sarcoidosis 
##                                                                                                  8 
##                                                                                            Putamen 
##                                                                                                 72 
##                                                                                             Rectum 
##                                                                                                  2 
##                                                                                             retina 
##                                                                                                  8 
##                                                                                             Retina 
##                                                                                                  8 
##                                                                    right atrial appendage biopsies 
##                                                                                                166 
##                                                                                         Right Lung 
##                                                                                                 40 
##                                                                    RNA from cardiac left ventricle 
##                                                                                                 32 
##                                                                         RNA from white blood cells 
##                                                                                                 67 
##                                                                                            RNA-seq 
##                                                                                                 25 
##  sensory neurons induced by long term NGN2-BRN3A expression in iPSC-derived neural crest (NC_iSN1) 
##                                                                                                  3 
##                                    sensory neurons induced by NGN2-BRN3A expression in iPSCs (iSN) 
##                                                                                                  3 
## sensory neurons induced by short term NGN2-BRN3A expression in iPSC-derived nueral crest (NC_iSN2) 
##                                                                                                  3 
##                                                                                               Skin 
##                                                                                                 48 
##                                                                                        Skin biopsy 
##                                                                                                 13 
##                                                                     skin fibroblasts from biopsies 
##                                                                                                  5 
##                                                                   small airway epithelium brushing 
##                                                                                                 12 
##                                                                                        SPTB female 
##                                                                                                  8 
##                                                                                          SPTB male 
##                                                                                                  8 
##                                                                          SSc\\, classical monocyte 
##                                                                                                 16 
##                                                                                    Synovial Tissue 
##                                                                                                 92 
##                                                                                        Term female 
##                                                                                                  8 
##                                                                                          Term male 
##                                                                                                  8 
##                                                                                             Tissue 
##                                                                                                 53 
##                                                         triple negative breast cancer tumor (FFPE) 
##                                                                                                 42 
##                                                                                      trophectoderm 
##                                                                                                  9 
##                                                                                              tumor 
##                                                                                                  1 
##                                                                                              Tumor 
##                                                                                                  3 
##                                                                                            tumor 1 
##                                                                                                  1 
##                                                                                           tumor 10 
##                                                                                                  1 
##                                                                                           tumor 11 
##                                                                                                  1 
##                                                                                           tumor 12 
##                                                                                                  1 
##                                                                                           tumor 13 
##                                                                                                  1 
##                                                                                           tumor 14 
##                                                                                                  1 
##                                                                                           tumor 15 
##                                                                                                  1 
##                                                                                           tumor 16 
##                                                                                                  1 
##                                                                                            tumor 2 
##                                                                                                  1 
##                                                                                            tumor 3 
##                                                                                                  1 
##                                                                                            tumor 4 
##                                                                                                  1 
##                                                                                            tumor 5 
##                                                                                                  1 
##                                                                                            tumor 6 
##                                                                                                  1 
##                                                                                            tumor 7 
##                                                                                                  1 
##                                                                                            tumor 8 
##                                                                                                  1 
##                                                                                            tumor 9 
##                                                                                                  1 
##                                                                              tumor adjacent normal 
##                                                                                                  1 
##                                                                                      U3A cell line 
##                                                                                                 18 
##                                                                           Ulcerative Colitis_Ileum 
##                                                                                                 44 
##                                                                             Unclassified IBD_Ileum 
##                                                                                                  1 
##                                             undifferentiated induced pluripotent stem cells (iPSC) 
##                                                                                                  3 
##                                                                unknown primary or metastatic tumor 
##                                                                                                 13 
##                                                                      Urothelial cancer FFPE tissue 
##                                                                                                 52 
##                                                                                             Uterus 
##                                                                                                 35 
##                                                                            vastus lateralis muscle 
##                                                                                                119 
##                                                                                Vena femoralis left 
##                                                                                                 13 
##                                                                               Vena femoralis right 
##                                                                                                  4 
##                                                                               Vena femoralis Right 
##                                                                                                 16 
##                                                                               Vena sbclavian right 
##                                                                                                  8 
##                                                                               Vena subclavian left 
##                                                                                                  1 
##                                                                               Vena Subclavian left 
##                                                                                                  8 
##                                                                              Vena subclavian right 
##                                                                                                  3 
##                                                                              Vena Subclavian Right 
##                                                                                                  1 
##                                                                                        whole blood 
##                                                                                               1028 
##                                                                                        Whole blood 
##                                                                                                253 
##                                                                                        Whole Blood 
##                                                                                                113 
##                                                                              whole blood leukocyte 
##                                                                                                 37 
##                                                              whole blood sample\\, healthy control 
##                                                                                                 45 
##                                                   whole blood sample\\, systemic sclerosis patient 
##                                                                                                232 
##                                                                   Whole blood_activity_tertile_one 
##                                                                                                  7 
##                                                                 Whole blood_activity_tertile_three 
##                                                                                                  5 
##                                                                   Whole blood_activity_tertile_two 
##                                                                                                  9 
##                                                                          Yg A_peripheral blood HSC 
##                                                                                                 94 
##                                                                          Yg B_peripheral blood HSC 
##                                                                                                 83 
##                                                                          Yg C_peripheral blood HSC 
##                                                                                                 94 
##                                                                          Yg D_peripheral blood HSC 
##                                                                                                 94
# Counts per source_material_cell_type; this column goes hand in hand with
# source_material, which is tabulated next.
table(allSRAFinal[["source_material_cell_type"]])
## 
## Fibroblast 
##        332
# Counts per source_material (companion of source_material_cell_type).
table(allSRAFinal[["source_material"]])
## 
## Skin tissue 
##         332
# Counts per SAMPLE_TYPE -- somewhat useful.
table(allSRAFinal[["SAMPLE_TYPE"]])
## 
##                                12-cell blastomere                                16-cell blastomere                                 4-cell blastomere 
##                                                 6                                                 7                                                 5 
##                                 8-cell blastomere                                              cell                                      cell culture 
##                                                23                                                12                                               190 
##                                      Cell Culture                 Cell culture Biological Replica 1                 Cell culture Biological Replica 2 
##                                                49                                                 7                                                 7 
##                 Cell culture Biological Replica 3               Day 0 Biological Replicate A WTC-11               Day 0 Biological Replicate B WTC-11 
##                                                 7                                                 1                                                 1 
##               Day 0 Biological Replicate C WTC-11              Day 13 Biological Replicate A WTC-11              Day 13 Biological Replicate B WTC-11 
##                                                 1                                                 1                                                 1 
##              Day 13 Biological Replicate C WTC-11              Day 18 Biological Replicate A WTC-11              Day 18 Biological Replicate B WTC-11 
##                                                 1                                                 1                                                 1 
##              Day 18 Biological Replicate C WTC-11              Day 28 Biological Replicate A WTC-11              Day 28 Biological Replicate B WTC-11 
##                                                 1                                                 1                                                 1 
##              Day 28 Biological Replicate C WTC-11               Day 7 Biological Replicate A WTC-11               Day 7 Biological Replicate B WTC-11 
##                                                 1                                                 1                                                 1 
##               Day 7 Biological Replicate C WTC-11                               DNA-seq and RNA-seq                                              FFPE 
##                                                 1                                                16                                                17 
##                                   inner cell mass                  iPSC- derived cerebral organoids                           iPSC-derived cell types 
##                                                13                                                24                                               304 
## Laser Capture Microdissection followed by RNA-seq                                         leukocyte                                          organoid 
##                                               258                                                32                                                39 
##                                              PBMC                                   PBMC from blood                                     polysomal RNA 
##                                                 5                                                36                                                 6 
##                                       Post-mortem                                          Resected                                               RNA 
##                                                20                                                30                                                82 
##                                           RNA-seq                                            RNASeq                                       single cell 
##                                                 8                                                 1                                                47 
##                                            Tissue                        tissue cytological samples                                     tissue sample 
##                                                38                                                17                                              1040 
##                                         total RNA                                     trophectoderm                                       whole blood 
##                                                 6                                                 9                                                60 
##                                       Whole Blood 
##                                                56
# Counts per Sampling_site -- once again this looks like a single study.
table(allSRAFinal[["Sampling_site"]])
## 
##  adrenal gland      bile duct        bladder           bone          brain         breast     chest wall          colon    endometrium         kidney          liver           lung 
##              1              2             16              1              1             15              3              1              1              1              4             33 
##     lymph node         muscle        omentum          other          ovary       pancreas pleural cavity           skin    soft tissue 
##             11              1              2              1              1              3              2              1              4
# Tissue, Cell_type and source_name are probably the most useful columns.
# Missingness cross-tab: rows = is.na(Tissue), columns = is.na(Cell_type).
table(
  is.na(allSRAFinal[["Tissue"]]),
  is.na(allSRAFinal[["Cell_type"]])
)
##        
##         FALSE  TRUE
##   FALSE  2568 13886
##   TRUE   2333  6601
# First rows in which both Tissue and Cell_type are filled in.
head(
  allSRAFinal[
    complete.cases(allSRAFinal[, c("Tissue", "Cell_type")]),
  ]
)
##       SRA.Study Center.Name    BioSample     Organism  LibrarySource Cell_type Tissue Cell_line disease ETHNICITY disease_state Diagnosis tissue_type clinical_information
## 58488 SRP166898         GEO SAMN10315825 Homo sapiens TRANSCRIPTOMIC       Tfh   PBMC      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 58489 SRP166898         GEO SAMN10315821 Homo sapiens TRANSCRIPTOMIC       Tfh   PBMC      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 58490 SRP166898         GEO SAMN10315820 Homo sapiens TRANSCRIPTOMIC       Tfh   PBMC      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 58491 SRP166898         GEO SAMN10315934 Homo sapiens TRANSCRIPTOMIC       Tfh   PBMC      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 58492 SRP166898         GEO SAMN10315933 Homo sapiens TRANSCRIPTOMIC       Tfh   PBMC      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 58493 SRP166898         GEO SAMN10315931 Homo sapiens TRANSCRIPTOMIC       Tfh   PBMC      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
##                            RACE health_state Population DONOR_HEALTH_STATUS DONOR_ETHNICITY reported_race ReleaseDate         create_date ancestry donor_id Sample_type source_name
## 58488                     White         <NA>       <NA>                <NA>            <NA>          <NA>  2018-12-27 2018-10-25 17:04:00     <NA>     <NA>        <NA>        PBMC
## 58489                     White         <NA>       <NA>                <NA>            <NA>          <NA>  2018-12-27 2018-10-25 17:04:00     <NA>     <NA>        <NA>        PBMC
## 58490                     White         <NA>       <NA>                <NA>            <NA>          <NA>  2018-12-27 2018-10-25 17:04:00     <NA>     <NA>        <NA>        PBMC
## 58491 Black or African American         <NA>       <NA>                <NA>            <NA>          <NA>  2018-12-27 2018-10-25 17:04:00     <NA>     <NA>        <NA>        PBMC
## 58492 Black or African American         <NA>       <NA>                <NA>            <NA>          <NA>  2018-12-27 2018-10-25 17:04:00     <NA>     <NA>        <NA>        PBMC
## 58493                     White         <NA>       <NA>                <NA>            <NA>          <NA>  2018-12-27 2018-10-25 17:04:00     <NA>     <NA>        <NA>        PBMC
##       source_material_cell_type source_material Organism_part race.ethnicity primary_race SAMPLE_TYPE Sampling_site                 finalRace finalGeography hispanic
## 58488                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>                     White           <NA>     <NA>
## 58489                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>                     White           <NA>     <NA>
## 58490                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>                     White           <NA>     <NA>
## 58491                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA> Black or African American           <NA>     <NA>
## 58492                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA> Black or African American           <NA>     <NA>
## 58493                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>                     White           <NA>     <NA>
# Missingness cross-tab: rows = is.na(Tissue), columns = is.na(source_name).
table(
  is.na(allSRAFinal[["Tissue"]]),
  is.na(allSRAFinal[["source_name"]])
)
##        
##         FALSE TRUE
##   FALSE  7303 9151
##   TRUE   7167 1767
# First rows in which both Tissue and source_name are filled in.
head(
  allSRAFinal[
    complete.cases(allSRAFinal[, c("Tissue", "source_name")]),
  ]
)
##       SRA.Study Center.Name    BioSample     Organism  LibrarySource Cell_type   Tissue Cell_line disease ETHNICITY disease_state Diagnosis tissue_type clinical_information
## 48644 SRP043166         GEO SAMN02852956 Homo sapiens TRANSCRIPTOMIC      <NA> Pterygia      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 48645 SRP043166         GEO SAMN02852957 Homo sapiens TRANSCRIPTOMIC      <NA> Pterygia      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 48646 SRP043166         GEO SAMN02852961 Homo sapiens TRANSCRIPTOMIC      <NA> Pterygia      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 48647 SRP043166         GEO SAMN02852955 Homo sapiens TRANSCRIPTOMIC      <NA> Pterygia      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 48648 SRP043166         GEO SAMN02852959 Homo sapiens TRANSCRIPTOMIC      <NA> Pterygia      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 48649 SRP043166         GEO SAMN02852964 Homo sapiens TRANSCRIPTOMIC      <NA> Pterygia      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
##            RACE health_state Population DONOR_HEALTH_STATUS DONOR_ETHNICITY reported_race ReleaseDate         create_date ancestry donor_id Sample_type source_name
## 48644 Caucasian         <NA>       <NA>                <NA>            <NA>          <NA>  2015-07-22 2015-12-09 02:16:00     <NA>     <NA>        <NA>   Pterygium
## 48645 Caucasian         <NA>       <NA>                <NA>            <NA>          <NA>  2015-07-22 2015-12-09 02:16:00     <NA>     <NA>        <NA>   Pterygium
## 48646 Caucasian         <NA>       <NA>                <NA>            <NA>          <NA>  2015-07-22 2015-12-09 02:16:00     <NA>     <NA>        <NA>   Pterygium
## 48647 Caucasian         <NA>       <NA>                <NA>            <NA>          <NA>  2015-07-22 2015-12-09 02:15:00     <NA>     <NA>        <NA>   Pterygium
## 48648 Caucasian         <NA>       <NA>                <NA>            <NA>          <NA>  2015-07-22 2015-12-09 02:16:00     <NA>     <NA>        <NA>   Pterygium
## 48649 Caucasian         <NA>       <NA>                <NA>            <NA>          <NA>  2015-07-22 2015-12-09 01:31:00     <NA>     <NA>        <NA>   Pterygium
##       source_material_cell_type source_material Organism_part race.ethnicity primary_race SAMPLE_TYPE Sampling_site finalRace finalGeography hispanic
## 48644                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>     White           <NA>     <NA>
## 48645                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>     White           <NA>     <NA>
## 48646                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>     White           <NA>     <NA>
## 48647                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>     White           <NA>     <NA>
## 48648                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>     White           <NA>     <NA>
## 48649                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>     White           <NA>     <NA>
# Missingness cross-tab: rows = is.na(Cell_type), columns = is.na(source_name).
table(
  is.na(allSRAFinal[["Cell_type"]]),
  is.na(allSRAFinal[["source_name"]])
)
##        
##         FALSE  TRUE
##   FALSE  2824  2077
##   TRUE  11646  8841
# First rows in which both Cell_type and source_name are filled in.
head(
  allSRAFinal[
    complete.cases(allSRAFinal[, c("Cell_type", "source_name")]),
  ]
)
##       SRA.Study Center.Name    BioSample     Organism  LibrarySource           Cell_type Tissue Cell_line disease ETHNICITY disease_state Diagnosis tissue_type
## 48582 SRP120018         GEO SAMN07789039 Homo sapiens TRANSCRIPTOMIC             HepLPCs   <NA>      <NA>    <NA>     Asian          <NA>      <NA>        <NA>
## 48583 SRP120018         GEO SAMN07789036 Homo sapiens TRANSCRIPTOMIC Primary hepatocytes   <NA>      <NA>    <NA>     Asian          <NA>      <NA>        <NA>
## 48656 SRP120018         GEO SAMN07789049 Homo sapiens TRANSCRIPTOMIC         HepLPCs-Hep   <NA>      <NA>    <NA>     Asian          <NA>      <NA>        <NA>
## 48657 SRP120018         GEO SAMN07789046 Homo sapiens TRANSCRIPTOMIC             HepLPCs   <NA>      <NA>    <NA>     Asian          <NA>      <NA>        <NA>
## 48658 SRP120018         GEO SAMN07789037 Homo sapiens TRANSCRIPTOMIC             HepLPCs   <NA>      <NA>    <NA>     Asian          <NA>      <NA>        <NA>
## 48659 SRP120018         GEO SAMN07789041 Homo sapiens TRANSCRIPTOMIC             HepLPCs   <NA>      <NA>    <NA>     Asian          <NA>      <NA>        <NA>
##       clinical_information RACE health_state Population DONOR_HEALTH_STATUS DONOR_ETHNICITY reported_race ReleaseDate         create_date ancestry donor_id Sample_type
## 48582                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2017-10-30 2017-10-24 11:32:00     <NA>     <NA>        <NA>
## 48583                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2017-10-30 2017-10-16 12:24:00     <NA>     <NA>        <NA>
## 48656                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2017-10-30 2017-10-16 12:11:00     <NA>     <NA>        <NA>
## 48657                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2017-10-30 2017-10-16 12:12:00     <NA>     <NA>        <NA>
## 48658                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2017-10-30 2017-10-16 12:10:00     <NA>     <NA>        <NA>
## 48659                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2017-10-30 2017-10-16 12:05:00     <NA>     <NA>        <NA>
##               source_name source_material_cell_type source_material Organism_part race.ethnicity primary_race SAMPLE_TYPE Sampling_site finalRace finalGeography hispanic
## 48582      Donor_1 TEM P5                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>           Asia     <NA>
## 48583        Donor_2 PHCs                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>           Asia     <NA>
## 48656 Donor_1 HepLPCs-Hep                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>           Asia     <NA>
## 48657   Donor_1 TEM Day10                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>           Asia     <NA>
## 48658   Donor_2 TEM Day10                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>           Asia     <NA>
## 48659      Donor_2 TEM P5                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>           Asia     <NA>

The reason I went down this rabbit hole is that about 3000 samples had no tissue info when we focused on the two most complete columns, Cell_type and Tissue. I checked some of them manually, and most seemed to have info in source_name and Organism_part, which I then rescued from the original SRA downloads. So let’s look at whether we really need the info from the other columns too:

# First rows in which neither Tissue nor Cell_type has a value.
head(
  allSRAFinal[
    rowSums(!is.na(allSRAFinal[, c("Tissue", "Cell_type")])) == 0,
  ]
)
##       SRA.Study Center.Name    BioSample     Organism  LibrarySource Cell_type Tissue Cell_line disease ETHNICITY disease_state Diagnosis tissue_type clinical_information
## 49372 SRP191569         GEO SAMN11372981 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 49382 SRP191569         GEO SAMN11372986 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 49383 SRP191569         GEO SAMN11372985 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 49384 SRP191569         GEO SAMN11372984 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 49385 SRP191569         GEO SAMN11372983 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
## 49386 SRP191569         GEO SAMN11372980 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA>
##                   RACE health_state Population DONOR_HEALTH_STATUS DONOR_ETHNICITY reported_race ReleaseDate         create_date ancestry donor_id Sample_type source_name
## 49372 African American         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-24 2019-04-09 14:25:00     <NA>     <NA>        <NA>       PBMCs
## 49382 African American         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-24 2019-04-09 14:21:00     <NA>     <NA>        <NA>       PBMCs
## 49383 African American         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-24 2019-04-09 14:21:00     <NA>     <NA>        <NA>       PBMCs
## 49384 African American         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-24 2019-04-09 14:21:00     <NA>     <NA>        <NA>       PBMCs
## 49385 African American         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-24 2019-04-09 14:23:00     <NA>     <NA>        <NA>       PBMCs
## 49386 African American         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-24 2019-04-09 14:22:00     <NA>     <NA>        <NA>       PBMCs
##       source_material_cell_type source_material Organism_part race.ethnicity primary_race SAMPLE_TYPE Sampling_site                 finalRace finalGeography hispanic
## 49372                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA> Black or African American           <NA>     <NA>
## 49382                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA> Black or African American           <NA>     <NA>
## 49383                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA> Black or African American           <NA>     <NA>
## 49384                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA> Black or African American           <NA>     <NA>
## 49385                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA> Black or African American           <NA>     <NA>
## 49386                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA> Black or African American           <NA>     <NA>
# First rows in which Tissue, Cell_type and source_name are all missing.
head(
  allSRAFinal[
    rowSums(
      !is.na(allSRAFinal[, c("Tissue", "Cell_type", "source_name")])
    ) == 0,
  ]
)
##        SRA.Study               Center.Name    BioSample     Organism  LibrarySource Cell_type Tissue Cell_line                              disease ETHNICITY disease_state
## 188456 ERP116722 Exeter Sequencing Service SAMEA5854572 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>                     cerebral malaria  Mandingo          <NA>
## 188457 ERP116722 Exeter Sequencing Service SAMEA5854573 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>                     cerebral malaria      Jola          <NA>
## 188459 ERP116722 Exeter Sequencing Service SAMEA5854574 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>                     cerebral malaria  Mandingo          <NA>
## 188460 ERP116722 Exeter Sequencing Service SAMEA5854575 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>                     cerebral malaria     Fulla          <NA>
## 188461 ERP116722 Exeter Sequencing Service SAMEA5854576 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>                     cerebral malaria     Fulla          <NA>
## 188463 ERP116722 Exeter Sequencing Service SAMEA5854577 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA> cerebral malaria and hyperlactatemia      Jola          <NA>
##        Diagnosis tissue_type clinical_information RACE health_state Population DONOR_HEALTH_STATUS DONOR_ETHNICITY reported_race ReleaseDate         create_date ancestry donor_id
## 188456      <NA>        <NA>                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-07 2022-07-24 09:35:00     <NA>     <NA>
## 188457      <NA>        <NA>                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-07 2021-10-10 16:52:00     <NA>     <NA>
## 188459      <NA>        <NA>                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-07 2021-10-10 18:19:00     <NA>     <NA>
## 188460      <NA>        <NA>                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-07 2022-07-18 16:45:00     <NA>     <NA>
## 188461      <NA>        <NA>                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-07 2021-10-10 17:04:00     <NA>     <NA>
## 188463      <NA>        <NA>                 <NA> <NA>         <NA>       <NA>                <NA>            <NA>          <NA>  2019-08-07 2022-07-15 18:32:00     <NA>     <NA>
##        Sample_type source_name source_material_cell_type source_material Organism_part race.ethnicity primary_race SAMPLE_TYPE Sampling_site finalRace    finalGeography hispanic
## 188456        <NA>        <NA>                      <NA>            <NA>         blood           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
## 188457        <NA>        <NA>                      <NA>            <NA>         blood           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
## 188459        <NA>        <NA>                      <NA>            <NA>         blood           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
## 188460        <NA>        <NA>                      <NA>            <NA>         blood           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
## 188461        <NA>        <NA>                      <NA>            <NA>         blood           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
## 188463        <NA>        <NA>                      <NA>            <NA>         blood           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
# Size of the subset in which all five descriptor columns are missing.
dim(allSRAFinal[
  rowSums(!is.na(allSRAFinal[c("Tissue", "Cell_type", "source_name", "tissue_type", "Organism_part")])) == 0,
])
## [1] 464  36
# Keep the rows with none of the five descriptors, then count them per study.
noInfo <- allSRAFinal[
  with(
    allSRAFinal,
    is.na(Tissue) & is.na(Cell_type) & is.na(source_name) &
      is.na(tissue_type) & is.na(Organism_part)
  ),
]
table(noInfo[["SRA.Study"]])
## 
## ERP001942 
##       464
# First 20 rows of each study in noInfo (n is forwarded to head()).
by(noInfo, noInfo$SRA.Study, head, n = 20)
## noInfo$SRA.Study: ERP001942
##        SRA.Study Center.Name    BioSample     Organism  LibrarySource Cell_type Tissue Cell_line disease ETHNICITY disease_state Diagnosis tissue_type clinical_information RACE
## 254802 ERP001942       UNIGE SAMEA1573207 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254803 ERP001942       UNIGE SAMEA1573203 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254804 ERP001942       MPIMG SAMEA1573212 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254805 ERP001942       UNIGE SAMEA1573516 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254806 ERP001942         CRG SAMEA1573527 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254807 ERP001942       MPIMG SAMEA1573339 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254808 ERP001942        LUMC SAMEA1573507 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254809 ERP001942       UNIGE SAMEA1573345 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254810 ERP001942       MPIMG SAMEA1573368 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254811 ERP001942        ICMB SAMEA1573523 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254812 ERP001942       UNIGE SAMEA1573219 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254813 ERP001942       UNIGE SAMEA1573202 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254814 ERP001942        ICMB SAMEA1573188 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254815 ERP001942        HMGU SAMEA1573508 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254816 ERP001942       UNIGE SAMEA1573329 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254817 ERP001942        LUMC SAMEA1573101 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254818 ERP001942        ICMB SAMEA1573158 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254819 ERP001942       UNIGE SAMEA1573233 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254820 ERP001942        LUMC SAMEA1573467 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
## 254821 ERP001942       UNIGE SAMEA1573324 Homo sapiens TRANSCRIPTOMIC      <NA>   <NA>      <NA>    <NA>      <NA>          <NA>      <NA>        <NA>                 <NA> <NA>
##        health_state Population DONOR_HEALTH_STATUS DONOR_ETHNICITY reported_race ReleaseDate         create_date ancestry donor_id Sample_type source_name
## 254802         <NA>        CEU                <NA>            <NA>          <NA>  2012-11-07 2022-07-21 01:31:00     <NA>     <NA>        <NA>        <NA>
## 254803         <NA>        CEU                <NA>            <NA>          <NA>  2012-11-07 2021-10-17 22:59:00     <NA>     <NA>        <NA>        <NA>
## 254804         <NA>        TSI                <NA>            <NA>          <NA>  2012-11-07 2022-07-21 03:21:00     <NA>     <NA>        <NA>        <NA>
## 254805         <NA>        CEU                <NA>            <NA>          <NA>  2012-11-07 2021-10-17 23:08:00     <NA>     <NA>        <NA>        <NA>
## 254806         <NA>        FIN                <NA>            <NA>          <NA>  2012-11-07 2021-10-17 23:14:00     <NA>     <NA>        <NA>        <NA>
## 254807         <NA>        YRI                <NA>            <NA>          <NA>  2012-11-07 2022-07-16 17:35:00     <NA>     <NA>        <NA>        <NA>
## 254808         <NA>        CEU                <NA>            <NA>          <NA>  2012-11-07 2022-07-22 21:13:00     <NA>     <NA>        <NA>        <NA>
## 254809         <NA>        FIN                <NA>            <NA>          <NA>  2012-11-07 2022-07-17 20:28:00     <NA>     <NA>        <NA>        <NA>
## 254810         <NA>        FIN                <NA>            <NA>          <NA>  2012-11-07 2022-07-22 20:59:00     <NA>     <NA>        <NA>        <NA>
## 254811         <NA>        YRI                <NA>            <NA>          <NA>  2012-11-07 2021-10-17 22:57:00     <NA>     <NA>        <NA>        <NA>
## 254812         <NA>        YRI                <NA>            <NA>          <NA>  2012-11-07 2022-07-16 21:14:00     <NA>     <NA>        <NA>        <NA>
## 254813         <NA>        FIN                <NA>            <NA>          <NA>  2012-11-07 2022-07-22 21:04:00     <NA>     <NA>        <NA>        <NA>
## 254814         <NA>        TSI                <NA>            <NA>          <NA>  2012-11-07 2022-07-13 08:22:00     <NA>     <NA>        <NA>        <NA>
## 254815         <NA>        TSI                <NA>            <NA>          <NA>  2012-11-07 2022-07-22 21:10:00     <NA>     <NA>        <NA>        <NA>
## 254816         <NA>        YRI                <NA>            <NA>          <NA>  2012-11-07 2022-07-14 22:14:00     <NA>     <NA>        <NA>        <NA>
## 254817         <NA>        FIN                <NA>            <NA>          <NA>  2012-11-07 2022-07-14 08:22:00     <NA>     <NA>        <NA>        <NA>
## 254818         <NA>        CEU                <NA>            <NA>          <NA>  2012-11-07 2021-10-17 23:08:00     <NA>     <NA>        <NA>        <NA>
## 254819         <NA>        GBR                <NA>            <NA>          <NA>  2012-11-07 2022-07-13 08:29:00     <NA>     <NA>        <NA>        <NA>
## 254820         <NA>        TSI                <NA>            <NA>          <NA>  2012-11-07 2022-07-16 19:43:00     <NA>     <NA>        <NA>        <NA>
## 254821         <NA>        YRI                <NA>            <NA>          <NA>  2012-11-07 2022-07-16 17:22:00     <NA>     <NA>        <NA>        <NA>
##        source_material_cell_type source_material Organism_part race.ethnicity primary_race SAMPLE_TYPE Sampling_site finalRace    finalGeography hispanic
## 254802                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254803                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254804                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254805                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254806                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254807                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
## 254808                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254809                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254810                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254811                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
## 254812                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
## 254813                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254814                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254815                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254816                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>
## 254817                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254818                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254819                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254820                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA>            Europe     <NA>
## 254821                      <NA>            <NA>          <NA>           <NA>         <NA>        <NA>          <NA>      <NA> Subsaharan Africa     <NA>

The only study left is the LCL one, ERP001942, which we’ll fix later. Looking at the table, there’s a lot of redundancy between columns, or at least a lot of studies that report different kinds of info in different columns, so it is worth being strategic so we’re not trying to disambiguate a bunch of weird things.

Once again, false/false is where we want to be looking

with(allSRAFinal, table(is.na(Tissue), is.na(Cell_type))) # Fair amount of overlap
##        
##         FALSE  TRUE
##   FALSE  2568 13886
##   TRUE   2333  6601
with(allSRAFinal, table(is.na(Tissue), is.na(source_name))) # Lots of overlap
##        
##         FALSE TRUE
##   FALSE  7303 9151
##   TRUE   7167 1767
with(allSRAFinal, table(is.na(Tissue), is.na(Organism_part))) # No one uses both of these at the same time
##        
##         FALSE  TRUE
##   FALSE     0 16454
##   TRUE    467  8467
with(allSRAFinal, table(is.na(Cell_type), is.na(source_name)))
##        
##         FALSE  TRUE
##   FALSE  2824  2077
##   TRUE  11646  8841
with(allSRAFinal, table(is.na(Cell_type), is.na(Organism_part)))
##        
##         FALSE  TRUE
##   FALSE    20  4881
##   TRUE    447 20040

source_name is clearly the most redundant one, but I grabbed all of them anyhow and did some manual spot-checking and cleaning up of the term list, which I think is mostly OK, or at least OK enough that results won’t change too much. We’ll want to keep an eye on the disease columns later on, though, and on the new disease column I made from those…

Now we explore a few ways of merging data… First we sanity check the order in which to merge…

# Merge the descriptor columns one at a time, in the order
# Tissue > tissue_type > Organism_part > Cell_type > source_name.
# After each step, count the distinct labels and the rows still unlabelled.

# Step 1: the two tissue type columns.
allSRAFinal$twoTissue <- coalesce(allSRAFinal$Tissue, allSRAFinal$tissue_type)
length(table(allSRAFinal$twoTissue)) # A few fewer than the sum of the two separate columns, but that's expected.
table(is.na(allSRAFinal$twoTissue)) # 18k have some sort of tissue assignment

# Step 2: fill the remaining gaps from Organism_part.
allSRAFinal$threeTissue <- coalesce(allSRAFinal$twoTissue, allSRAFinal$Organism_part)
length(table(allSRAFinal$threeTissue)) # Distinct labels after adding Organism_part.
table(is.na(allSRAFinal$threeTissue)) # only gained like 600...

# Step 3: fill the remaining gaps from Cell_type.
allSRAFinal$fourTissue <- coalesce(allSRAFinal$threeTissue, allSRAFinal$Cell_type)
length(table(allSRAFinal$fourTissue)) # Distinct labels after adding Cell_type.
table(is.na(allSRAFinal$fourTissue)) # 21k have some sort of tissue assignment

# Step 4: fill the remaining gaps from source_name.
allSRAFinal$fiveTissue <- coalesce(allSRAFinal$fourTissue, allSRAFinal$source_name)
length(table(allSRAFinal$fiveTissue)) # Distinct labels after adding source_name.
table(is.na(allSRAFinal$fiveTissue)) # 21k have some sort of tissue assignment

# The missing 464 are the LCLs from that one paper, so we're good there, we'll fix them later.

# What kind of gaps did we fill with each step?
# Each call shows rows that were labelled by this step but not by the previous one.
head(allSRAFinal[!is.na(allSRAFinal$twoTissue) & is.na(allSRAFinal$Tissue), ], n = 20)
head(allSRAFinal[!is.na(allSRAFinal$threeTissue) & is.na(allSRAFinal$twoTissue), ], n = 20)
head(allSRAFinal[!is.na(allSRAFinal$fourTissue) & is.na(allSRAFinal$threeTissue), ], n = 20)
head(allSRAFinal[!is.na(allSRAFinal$fiveTissue) & is.na(allSRAFinal$fourTissue), ], n = 20)

# Most frequent labels at each step. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
head(sort(table(allSRAFinal$twoTissue), decreasing = TRUE))
head(sort(table(allSRAFinal$threeTissue), decreasing = TRUE))
head(sort(table(allSRAFinal$fourTissue), decreasing = TRUE))
head(sort(table(allSRAFinal$fiveTissue), decreasing = TRUE))

And now it’s time to update each of those, just for sanity checking, before we pick which approach to go with

For some reason the match command won’t work well with the indexing inside the command, so we gotta separate it out. Don’t care to troubleshoot.

# Curated term list; coded.as records which raw column each term came from.
cleanInfo <- read.csv(file = "20240616_cell_type_tissue_descriptors.csv")

# One lookup table per raw descriptor column.
cleanTissue <- cleanInfo[which(cleanInfo$coded.as == "tissue"), ]
cleanTissue_type <- cleanInfo[which(cleanInfo$coded.as == "tissue_type"), ]
cleanOrganism_part <- cleanInfo[which(cleanInfo$coded.as == "Organism_part"), ]
cleanCell_type <- cleanInfo[which(cleanInfo$coded.as == "cell_type"), ]
cleanSource_name <- cleanInfo[which(cleanInfo$coded.as == "source_name"), ]

# Ten lookup columns in total: a system and an organ for each of the five raw
# descriptor columns, coalesced into one final column of each kind at the end.
# incomparables = NA stops a missing descriptor from matching a missing Term;
# anything unmatched comes back as NA.

# Systems
allSRAFinal$tissueSystem <-
  cleanTissue$system[match(allSRAFinal$Tissue, cleanTissue$Term, incomparables = NA)]
allSRAFinal$tissue_typeSystem <-
  cleanTissue_type$system[match(allSRAFinal$tissue_type, cleanTissue_type$Term, incomparables = NA)]
allSRAFinal$Organism_partSystem <-
  cleanOrganism_part$system[match(allSRAFinal$Organism_part, cleanOrganism_part$Term, incomparables = NA)]
allSRAFinal$Cell_typeSystem <-
  cleanCell_type$system[match(allSRAFinal$Cell_type, cleanCell_type$Term, incomparables = NA)]
allSRAFinal$source_nameSystem <-
  cleanSource_name$system[match(allSRAFinal$source_name, cleanSource_name$Term, incomparables = NA)]

# Organs
allSRAFinal$tissueOrgan <-
  cleanTissue$organ[match(allSRAFinal$Tissue, cleanTissue$Term, incomparables = NA)]
allSRAFinal$tissue_typeOrgan <-
  cleanTissue_type$organ[match(allSRAFinal$tissue_type, cleanTissue_type$Term, incomparables = NA)]
allSRAFinal$Organism_partOrgan <-
  cleanOrganism_part$organ[match(allSRAFinal$Organism_part, cleanOrganism_part$Term, incomparables = NA)]
allSRAFinal$Cell_typeOrgan <-
  cleanCell_type$organ[match(allSRAFinal$Cell_type, cleanCell_type$Term, incomparables = NA)]
allSRAFinal$source_nameOrgan <-
  cleanSource_name$organ[match(allSRAFinal$source_name, cleanSource_name$Term, incomparables = NA)]

# Coalesce in the order established above; the first non-missing value wins.
allSRAFinal$finalSystem <- coalesce(
  allSRAFinal$tissueSystem,
  allSRAFinal$tissue_typeSystem,
  allSRAFinal$Organism_partSystem,
  allSRAFinal$Cell_typeSystem,
  allSRAFinal$source_nameSystem
)

allSRAFinal$finalOrgan <- coalesce(
  allSRAFinal$tissueOrgan,
  allSRAFinal$tissue_typeOrgan,
  allSRAFinal$Organism_partOrgan,
  allSRAFinal$Cell_typeOrgan,
  allSRAFinal$source_nameOrgan
)

So that seems like it should have worked, but we should sanity check it a bit more… First, do things agree across the descriptor levels, and if not, what should we do?

# Heatmap of log2 counts: final system call vs the call derived from Tissue.
ggplot(
  melt(table(allSRAFinal$finalSystem, allSRAFinal$tissueSystem)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "System annotation", x = "Tissue", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# Heatmap of log2 counts: final system call vs the call derived from tissue_type.
ggplot(
  melt(table(allSRAFinal$finalSystem, allSRAFinal$tissue_typeSystem)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "System annotation", x = "tissue_type", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# Heatmap of log2 counts: final system call vs the call derived from Organism_part.
ggplot(
  melt(table(allSRAFinal$finalSystem, allSRAFinal$Organism_partSystem)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "System annotation", x = "Organism_part", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# These two look badly inconsistent. Some of it will be down to my old
# approach, but other stuff is going to be harder to clean up.
# Heatmap of log2 counts: final system call vs the call derived from Cell_type.
ggplot(
  melt(table(allSRAFinal$finalSystem, allSRAFinal$Cell_typeSystem)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "System annotation", x = "Cell_type", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# Heatmap of log2 counts: final system call vs the call derived from source_name.
ggplot(
  melt(table(allSRAFinal$finalSystem, allSRAFinal$source_nameSystem)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "System annotation", x = "source_name", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# Same comparisons again, this time for the organ annotation.
# Heatmap of log2 counts: final organ call vs the call derived from Tissue.
ggplot(
  melt(table(allSRAFinal$finalOrgan, allSRAFinal$tissueOrgan)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Organ annotation", x = "Tissue", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# Heatmap of log2 counts: final organ call vs the call derived from tissue_type.
ggplot(
  melt(table(allSRAFinal$finalOrgan, allSRAFinal$tissue_typeOrgan)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Organ annotation", x = "tissue_type", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# Heatmap of log2 counts: final organ call vs the call derived from Organism_part.
ggplot(
  melt(table(allSRAFinal$finalOrgan, allSRAFinal$Organism_partOrgan)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Organ annotation", x = "Organism_part", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# These two look badly inconsistent. Some of it will be down to my old
# approach, but other stuff is going to be harder to clean up.
# Heatmap of log2 counts: final organ call vs the call derived from Cell_type.
ggplot(
  melt(table(allSRAFinal$finalOrgan, allSRAFinal$Cell_typeOrgan)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Organ annotation", x = "Cell_type", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# Heatmap of log2 counts: final organ call vs the call derived from source_name.
ggplot(
  melt(table(allSRAFinal$finalOrgan, allSRAFinal$source_nameOrgan)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Organ annotation", x = "source_name", y = "Final") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# How often do the Cell_type-based and source_name-based organ calls agree?
ggplot(
  melt(table(allSRAFinal$Cell_typeOrgan, allSRAFinal$source_nameOrgan)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Organ annotation", x = "source_name", y = "Cell_type") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# How do the final organ and final system annotations line up?
ggplot(
  melt(table(allSRAFinal$finalOrgan, allSRAFinal$finalSystem)),
  aes(x = Var.2, y = Var.1)
) +
  geom_tile(aes(fill = log2(value)), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Master annotation", x = "System", y = "Organ") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

Uh oh… There’s a fair bit of disagreement when we compare the cell type based assignments and the tissue based assignments. Some of the mismatches kind of make sense, and some really don’t, e.g., intestine/iPSC or reproductive/digestive.

# Let's look only at the discrepancies... write a little function to show us because otherwise it gets annoying
#' Rows where two annotation columns disagree.
#'
#' Returns the rows of `data` in which both columns are present and their
#' values differ. Rows where either column is NA are not counted as mismatches.
#'
#' @param Var1 Name of the first column to compare (single string).
#' @param Var2 Name of the second column to compare (single string).
#' @param data Data frame to search; defaults to the global `allSRAFinal`, so
#'   existing two-argument calls keep working.
#' @return The matching rows of `data`, with their original row names; zero
#'   rows when there are no mismatches.
seeMismatches <- function(Var1, Var2, data = allSRAFinal) {
  # A misspelt column name would otherwise silently yield "no mismatches".
  requested <- c(Var1, Var2)
  if (is.character(requested)) {
    unknown <- setdiff(requested, names(data))
    if (length(unknown) > 0L) {
      stop("Unknown column(s): ", paste(unknown, collapse = ", "), call. = FALSE)
    }
  }

  # The comparison is NA wherever either value is missing. which() keeps only
  # the TRUE positions, so no all-NA rows are created and then filtered out.
  differs <- data[[Var1]] != data[[Var2]]
  data[which(differs), , drop = FALSE]
}

# Rows where the final call differs from the Cell_type-based or
# source_name-based call, for organ and for system.
cellOrganMismatches <- seeMismatches(Var1 = "finalOrgan", Var2 = "Cell_typeOrgan")
cellSystemMismatches <- seeMismatches(Var1 = "finalSystem", Var2 = "Cell_typeSystem")
sourceOrganMismatches <- seeMismatches(Var1 = "finalOrgan", Var2 = "source_nameOrgan")
sourceSystemMismatches <- seeMismatches(Var1 = "finalSystem", Var2 = "source_nameSystem")

# Final organ (rows) vs Cell_type-based organ (columns), mismatching rows only.
table(cellOrganMismatches[["finalOrgan"]], cellOrganMismatches[["Cell_typeOrgan"]])
##               
##                blood cancer heart iPSC uterus
##   blood vessel     0      0     1    0      0
##   bone marrow     31      0     0    0      0
##   intestine        0      0     0   39      0
##   lung            66      0    12    0      0
##   ovary            0     17     0    0      1
##   skin             0      0     0   15      0
# Final system (rows) vs Cell_type-based system (columns), mismatching rows only.
table(cellSystemMismatches[["finalSystem"]], cellSystemMismatches[["Cell_typeSystem"]])
##                       
##                        cancer cardiovascular connective tissue immune
##   anatomical structure      0              0                12      0
##   connective tissue        10              0                 0      0
##   hematopoetic              0              0                 0     31
##   integument                0              0                36      0
##   reproductive             17              0                 0      0
##   respiratory               0             12                 0     66
# Final organ (rows) vs source_name-based organ (columns), mismatching rows only.
table(sourceOrganMismatches[["finalOrgan"]], sourceOrganMismatches[["source_nameOrgan"]])
##           
##            cancer lung lymph node
##   blood         0    8         66
##   ovary         9    0          0
##   prostate     16    0          0
##   uterus        4    0          0
# Final system (rows) vs source_name-based system (columns), mismatching rows only.
table(sourceSystemMismatches[["finalSystem"]], sourceSystemMismatches[["source_nameSystem"]])
##               
##                cancer connective tissue lymphatic respiratory
##   immune            0                 0        66           8
##   reproductive     29               129         0           0
# Column positions to show when eyeballing mismatches.
# NOTE(review): positional indices break if the column order changes.
columnsILike <- c(1:3, 6:8, 13, 26, 29, 44, 45, 49:52)
 
# by(cellOrganMismatches, c(cellOrganMismatches$finalOrgan), function(x) head(x[,columnsILike], n = 20))
# by(cellSystemMismatches, c(cellSystemMismatches$Cell_typeSystem), function(x) head(x[,columnsILike], n = 20))
# by(sourceOrganMismatches, c(sourceOrganMismatches$finalOrgan), function(x) head(x[,columnsILike], n = 20))
# by(sourceSystemMismatches, c(sourceSystemMismatches$finalSystem), function(x) head(x[,columnsILike], n = 20))

# Studies contributing the Cell_type organ mismatches.
table(cellOrganMismatches[["SRA.Study"]])
## 
## DRP001797 SRP221289 SRP221290 SRP223239 SRP259538 SRP266877 SRP310133 SRP328303 SRP362734 
##         1         8        10        66        12        15        20        11        39
# Studies contributing the Cell_type system mismatches.
table(cellSystemMismatches[["SRA.Study"]])
## 
## SRP070663 SRP125571 SRP219483 SRP220383 SRP221289 SRP221290 SRP223239 SRP259538 SRP310133 SRP328303 
##         4        24        10        10         7        20        66        12        20        11
# Studies contributing the source_name organ mismatches.
table(sourceOrganMismatches[["SRA.Study"]])
## 
## SRP118614 SRP223239 SRP290921 SRP352989 
##        16        66        13         8
# Studies contributing the source_name system mismatches.
table(sourceSystemMismatches[["SRA.Study"]])
## 
## SRP118614 SRP223239 SRP225709 SRP290921 SRP352989 
##        16        66       129        13         8
# by(cellOrganMismatches, c(cellOrganMismatches$SRA.Study), function(x) head(x[,columnsILike], n = 20))
# by(cellSystemMismatches, c(cellSystemMismatches$SRA.Study), function(x) head(x[,columnsILike], n = 20))
# by(sourceOrganMismatches, c(sourceOrganMismatches$SRA.Study), function(x) head(x[,columnsILike], n = 20))
# by(sourceSystemMismatches, c(sourceSystemMismatches$SRA.Study), function(x) head(x[,columnsILike], n = 20))

Some of these are clearly incorrect assignments made when cleaning up the tissue terms, so the big spreadsheet needs to be updated, e.g., the foreskin fibroblasts from Coriell, which came up as urogenital (SRP070663, now fixed). However, there are also a lot of problems with samples that end up in different classes depending on which column one looks at, e.g., SRP266877 should really be under iPSC but isn’t because tissue is given as blood, even though it’s an established cell line… Not all the cases are clear cut, and some are outright contradictory, e.g., DRP001797, which says endocardial cells but also endothelium of blood vessels; SRP056287, which gives cell type as cortical neurons and tissue as fibroblast, and which seems to be iPSC-derived cortical neurons (updated); or SRP259538, which lists cardiomyocyte and foetal lung (but they’re cell-line derived anyhow).

There are a lot of immune cells sampled in the lung, and I’m not sure what to do with them - keep them as both things?

So that’s a first pass, but we need to get rid of established cell lines and then come back to some of this. Next, some checks for missingness etc

# Number of samples per assigned system:
table(allSRAFinal[["finalSystem"]])
## 
## anatomical structure               cancer       cardiovascular    connective tissue        developmental            digestive            endocrine              enteric 
##                  772                  627                  996                  227                   63                 1313                   56                   12 
##         hematopoetic              hepatic               immune           integument            lymphatic             muscular              nervous         reproductive 
##                  353                  344                13621                  408                   40                  823                 1783                  736 
##          respiratory             skeletal           urogenital               visual 
##                  708                  130                  175                   47
# Number of samples per assigned organ:
table(allSRAFinal[["finalOrgan"]])
## 
##         adipose   adrenal gland   biliary tract         bladder      blastoderm           blood    blood vessel            bone     bone marrow           brain          breast 
##             107               3              10              38              22           13618             222              35             353            1688             658 
##          cancer       cartilage             CNS digestive tract             eye  fallopian tube     gallbladder           heart       intestine            iPSC           joint 
##             637               3              79              59              47               8               3             774            1036            1457              92 
##          kidney          larynx           liver            lung      lymph node          morula          muscle            neck            nose     oral cavity           ovary 
##              59               1             344             594              33              41             823               4             114              97             321 
##        pancreas pituitary gland        placenta             PNS        prostate            skin          spleen         stomach          testis          thymus         thyroid 
##              98               7              32              28             259             408               3             107              16               1              46 
##          tonsil         trachea   urinary tract          uterus          vagina 
##               6              12              78              84              16
# Cross-tabulate missingness: rows = finalSystem is NA, columns = finalOrgan is NA.
table(is.na(allSRAFinal[["finalSystem"]]), is.na(allSRAFinal[["finalOrgan"]]))
##        
##         FALSE  TRUE
##   FALSE 23128   106
##   TRUE   1453   701
# Marginal counts of missing systems...
table(is.na(allSRAFinal[["finalSystem"]])) 
## 
## FALSE  TRUE 
## 23234  2154
# ...and of missing organs.
table(is.na(allSRAFinal[["finalOrgan"]]))
## 
## FALSE  TRUE 
## 24581   807
# Rows where both the organ and the system are missing, tallied by study.
noAssignment <- allSRAFinal[rowSums(is.na(allSRAFinal[, c("finalOrgan", "finalSystem")])) == 2, ]
table(noAssignment[["SRA.Study"]])
## 
## DRP001150 ERP001942 ERP122083 ERP122103 ERP122256 SRP151215 SRP154973 SRP179998 SRP181079 SRP181649 SRP186687 SRP343308 SRP344328 
##         1       464         6         1         5        25         4        15         8        15        19       124        14
# by(noAssignment, noAssignment$SRA.Study, function(x) head(x))

Most of these had info in one of the tissue columns, but sometimes I had to go back to the SRA Run Selector and dig a bit deeper.

# What are the NAs...? 464 of them are the LCL study (ERP001942), so let's fix that first of all,
# together with the other studies whose tissue could be recovered by hand.
# One row per SRA study; the system/organ given here is applied to ALL samples of that study.
manualAssignments <- data.frame(
  study = c("ERP001942", "SRP151215", "SRP181079",
            "ERP122083", "ERP122103", "ERP122256",
            "SRP344328", "SRP343308", "SRP179998", "SRP154973"),
  system = c("immune", "immune", "immune",
             "cancer", "cancer", "cancer",
             "hepatic", "digestive", "nervous", "respiratory"),
  organ = c("blood", "blood", "blood",
            "cancer", "cancer", "cancer",
            "liver", "intestine", "brain", "lung"),
  stringsAsFactors = FALSE
)

# A study ID that matches no sample is most likely a typo, so report it instead of skipping it silently.
unmatchedStudies <- setdiff(manualAssignments$study, allSRAFinal$SRA.Study)
if (length(unmatchedStudies) > 0) {
  warning("No samples found for: ", paste(unmatchedStudies, collapse = ", "), call. = FALSE)
}

# Assign through the column vectors rather than `df[rows, ]$col <- value`:
# this is a plain no-op when nothing matches and does not copy the row subset.
fixRow <- match(allSRAFinal$SRA.Study, manualAssignments$study)
needsFix <- !is.na(fixRow)
allSRAFinal$finalSystem[needsFix] <- manualAssignments$system[fixRow[needsFix]]
allSRAFinal$finalOrgan[needsFix] <- manualAssignments$organ[fixRow[needsFix]]

# Re-check which studies still have neither an organ nor a system.
noAssignment <- allSRAFinal[is.na(allSRAFinal$finalOrgan) & is.na(allSRAFinal$finalSystem), ]
table(noAssignment$SRA.Study)
## 
## DRP001150 SRP181649 SRP186687 
##         1        15        19
# by(noAssignment, noAssignment$SRA.Study, function(x) head(x))

# Remove that mixed tissue one.
# (SRP181649 and SRP186687 stay unassigned here; both are on the established-cell-line exclusion list further down.)
allSRAFinal <- allSRAFinal[!(allSRAFinal$SRA.Study %in% "DRP001150"), ]

rm(noAssignment, manualAssignments, unmatchedStudies, fixRow, needsFix)

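Much better… so we’ve resolved most of the NAs (the two studies still unassigned, SRP181649 and SRP186687, are established cell lines that get excluded below), but not the tissue ambiguities, which I think might just have to stay as is.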

Now we look at the rows with info encoded in the Cell_line column, in addition to our organ and tissue assignments. Lots of these are cell lines that have ancestry info, but are established/immortalised, which was not included in our initial search… but should it have been? A key culprit is SRP186687, which is almost 1000 cancer samples with sequencing.

The brutal approach is to simply remove the entire study, and then come back again. But it would be smarter to also whitelist some terms and not just remove the column, or remove any rows with info in the column. SRP347253 has some epic cell line names, for instance, and all the occurrences of LCL etc should be kept - but first gotta check they’re not already covered in other columns!

# How many rows carry anything in the Cell_line column?
table(!is.na(allSRAFinal$Cell_line))
## 
## FALSE  TRUE 
## 22966  2421
# Keep only those rows and rank the studies by how many of them they contribute.
allCellLine <- allSRAFinal[!is.na(allSRAFinal$Cell_line), ]
sort(table(allCellLine$SRA.Study), decreasing = TRUE)
## 
## SRP186687 SRP347253 ERP114122 SRP344545 SRP132693 SRP154009 SRP323748 SRP362734 SRP106527 SRP217300 SRP090531 SRP125571 SRP151215 SRP103111 SRP174206 SRP223674 SRP330940 SRP221374 
##       979       304       159       108        91        44        40        39        30        30        29        28        25        24        24        24        24        23 
## SRP075592 SRP237644 SRP408226 SRP292295 SRP191042 SRP222944 SRP292536 SRP056109 SRP073347 SRP367525 SRP266877 SRP269147 SRP155367 SRP178235 SRP179498 SRP193979 SRP259538 SRP334792 
##        21        21        20        19        18        18        18        16        16        16        15        14        12        12        12        12        12        12 
## SRP352928 SRP131505 SRP198410 SRP219483 SRP220383 SRP091668 SRP041840 SRP057446 SRP060253 SRP194087 SRP220267 SRP070663 SRP052896 SRP066625 SRP181649 SRP298738 SRP074707 SRP111915 
##        12        10        10        10        10         8         6         6         6         6         6         4         3         3         3         3         2         2 
## SRP164930 
##         2
# Some of these are clear keepers, some of them are clear removals. But the question is which is which, and how do I make my life easier? Let's check the big ones first:
head(sort(table(allCellLine$Cell_line), decreasing = TRUE), n = 50)
## 
##                     lymphoblastoid cell line                                     organoid                                       raisin                                          AGS 
##                                          159                                           39                                           36                                           26 
##                                         PBMC                                        HBMEC             lymphoblastoid cell lines (LCLs)                                         MCF7 
##                                           26                                           25                                           25                                           25 
##                                        HepG2                                motor neurons                      PC-3 (ATCC\\, CRL-1435)                                 B lymphocyte 
##                                           24                                           24                                           24                                           23 
##                                       SKOV-3                                   MDA-MB-231                                       PANC-1             primary vaginal epithelial cells 
##                                           21                                           19                                           17                                           16 
##                                       WTC-11                                       merlot                                       butter                                          sun 
##                                           15                                           14                                           13                                           13 
##                     HB-8065 (Hep G2) [HEPG2]                    hiPSC-derived trophoblast                 IMR90-derived cardiomyocytes primarily peripheral blood mononuclear cells 
##                                           12                                           12                                           12                                           12 
##                                WC-24-02-DS-B                                WC-24-02-DS-M                                        dijon                                        lilac 
##                                           12                                           12                                           11                                           11 
##                                       violet                                         A549                                         corn                                       EFO-21 
##                                           11                                           10                                           10                                           10 
##                                   fibroblast                                        grape                                       HDLM-2                                   MDA-MB-468 
##                                           10                                           10                                           10                                           10 
##                                       orchid                                        23555                                        28815                                        apple 
##                                           10                                            9                                            9                                            9 
##                                  fibroblasts                                        honey                                       RWPE-1                                        21792 
##                                            9                                            9                                            9                                            8 
##                                        27322                                        29089                                       coffee                                       H21792 
##                                            8                                            8                                            8                                            8 
##                                       H25237                                       H28815 
##                                            8                                            8
# Optional per-study breakdowns of the Cell_line rows (assigned organ, assigned system, raw
# Cell_line values, and the first 20 rows of selected columns); uncomment to inspect.
# by(allCellLine, allCellLine$SRA.Study, function(x) sort(table(x$finalOrgan),  decreasing=T))
# by(allCellLine, allCellLine$SRA.Study, function(x) sort(table(x$finalSystem),  decreasing=T))
# by(allCellLine, allCellLine$SRA.Study, function(x) sort(table(x$Cell_line),  decreasing=T))
# by(allCellLine, allCellLine$SRA.Study, function(x) head(x[,columnsILike], n=20))
# Drop the temporary subset.
rm(allCellLine)

Let’s organise this a bit:

Studies to exclude:

  • SRP186687: the huge cancer study, currently assigned to various organs and lots of NA. exclude.
  • SRP217300: MIA PaCa, established pancreatic cancer cell line. exclude.
  • SRP090531: mix of established cell lines and lines that closely match other cellosaurus IDs. exclude.
  • SRP103111: PC-3. established. exclude.
  • SRP075592: established. exclude.
  • SRP237644: established. exclude.
  • SRP408226: HepG2. exclude.
  • SRP292295: established cell line. exclude.
  • SRP191042: established. exclude.
  • SRP292536: established cancer cell lines. exclude.
  • SRP056109: established cancer cell line. exclude.
  • SRP269147: established cancer cell lines. exclude.
  • SRP178235: established. exclude.
  • SRP179498: established. exclude.
  • SRP193979: HepG2. exclude.
  • SRP259538: established cell line. exclude.
  • SRP334792: prostate cancer cell lines from ATCC. exclude.
  • SRP131505: established cancer cell lines. exclude.
  • SRP091668: established. exclude.
  • SRP041840: established. exclude.
  • SRP057446: HepG2 and related. exclude.
  • SRP060253: AGS. exclude.
  • SRP194087: established. exclude.
  • SRP220267: Caco2. exclude.
  • SRP066625: HL-60/S4. exclude
  • SRP181649: Huh-7. exclude.
  • SRP298738: established AML cell line. exclude.
  • SRP074707: established. exclude.
  • SRP111915: established. exclude.
  • SRP164930: established. exclude.

Studies to keep:

These are mostly iPSCs and derivatives, so not too sure what to do with them:

  • SRP347253: the big iPSC panel, correctly assigned to iPSCs. ok
  • SRP132693: iPSC, iPSC derived cardiomyocytes and primary heart tissue. correctly labelled. ok.
  • SRP154009: iPSC and iPSC derived cardiomyocytes. correctly labelled. ok
  • SRP323748: SRA accession says they’re all iPSC derived cardiomyocytes from iPSCore, and this is recorded in the ‘cell_subtype’ column. need to update. Unsure.
  • SRP362734: colorectal cancer organoids… currently processed as if colorectal. need to update. unsure.
  • SRP125571: iPSC-derived cortical neurons (in Sample_type) from coriell fibroblasts. need to update. unsure.
  • SRP223674: iPSC-derived motor neurons, incorrectly labelled as blood (starting material). need to update. unsure.
  • SRP330940: iPSCs and iPSC-derived material from publicly available line. unsure.
  • SRP222944: iPSC-derived tissues, labelled as peripheral blood. need to update. unsure.
  • SRP266877: WTC-11 iPSC cell line. need to update. unsure.
  • SRP155367: iPSC and iPSC derived material. SRA says from Coriell, but neither cellosaurus nor coriell have heard of this. unsure.
  • SRP070663: Coriell cell lines and iPSCs from them, correctly assigned. unsure.

These are easy to justify keeping:

So that’s a quick summary. Let’s clean this up and then check out what happens with all of those disagreements from before; this has also identified a few that need to be updated:

# First we make a list of a lot of studies to exclude (the established cell line studies listed above):
establishedStudies <- c(
  "SRP164930", "SRP111915", "SRP074707", "SRP298738", "SRP181649", "SRP066625",
  "SRP220267", "SRP194087", "SRP060253", "SRP057446", "SRP041840", "SRP091668",
  "SRP131505", "SRP334792", "SRP259538", "SRP193979", "SRP179498", "SRP178235",
  "SRP269147", "SRP056109", "SRP292536", "SRP191042", "SRP292295", "SRP408226",
  "SRP237644", "SRP075592", "SRP103111", "SRP090531", "SRP217300", "SRP186687"
)

allSRAFinal <- allSRAFinal[!(allSRAFinal$SRA.Study %in% establishedStudies), ]
dim(allSRAFinal) # Looks better
## [1] 24028    48
# Then we gotta make sure all the other tissue assignments are ok:
# keep every row of any remaining study that has at least one Cell_line entry.
allCellLine <- allSRAFinal[allSRAFinal$SRA.Study %in% allSRAFinal$SRA.Study[!is.na(allSRAFinal$Cell_line)], ] # Yes again
by(allCellLine, allCellLine$SRA.Study, function(x) sort(table(x$finalOrgan), decreasing = TRUE))
## allCellLine$SRA.Study: ERP114122
## blood 
##   159 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP052896
## lung 
##    1 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP070663
## 
## iPSC skin 
##    8    4 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP073347
## blood 
##    16 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP106527
## blood 
##    30 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP125571
## 
## oral cavity        skin 
##          12          12 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP132693
## 
##  iPSC heart 
##    69    22 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP151215
## blood 
##    25 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP154009
## iPSC 
##   44 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP155367
## iPSC 
##   12 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP174206
## 
## blood  skin 
##    14    10 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP198410
## blood 
##    10 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP219483
## skin 
##   10 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP220383
## skin 
##   10 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP221374
## blood 
##    23 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP222944
## blood 
##    18 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP223674
## 
## blood  skin 
##    18     6 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP266877
## skin 
##   15 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP323748
## 
##  skin blood 
##    36     4 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP330940
## iPSC 
##   24 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP344545
## blood 
##   108 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP347253
## iPSC 
##  304 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP352928
## blood 
##    12 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP362734
## intestine 
##        39 
## --------------------------------------------------------------------------------------------------------------------------------------- 
## allCellLine$SRA.Study: SRP367525
## vagina 
##     16
# Some more updates...
# Every sample of these studies is relabelled as organ "iPSC" and its system is blanked out.
# NOTE(review): SRP362734 is described in the study notes as colorectal cancer organoids rather
# than iPSC-derived material - confirm that the iPSC label is intended for it.
# The mask is built once and applied to the column vectors, so the study list is not duplicated
# and the assignment is a plain no-op if none of the studies is present.
relabelAsIPSC <- allSRAFinal$SRA.Study %in%
  c("SRP362734", "SRP323748", "SRP266877", "SRP223674", "SRP222944", "SRP125571")
allSRAFinal$finalSystem[relabelAsIPSC] <- NA
allSRAFinal$finalOrgan[relabelAsIPSC] <- "iPSC"
rm(relabelAsIPSC)

# I guess we could make all the plots again... but easier to just tally disagreements?
# Shared tile-plot layout for the five heatmaps below. `counts` is the melted contingency
# table (columns Var.1, Var.2 and value); tiles are coloured by log2 of the count.
drawAnnotationHeatmap <- function(counts, plotTitle, xLabel, yLabel) {
  ggplot(counts, aes(Var.2, Var.1)) +
    geom_tile(aes(fill = log2(value)), colour = "white") +
    scale_fill_gradient(low = "white", high = "red") +
    ggtitle(plotTitle) +
    xlab(xLabel) +
    ylab(yLabel) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}

# Final system vs the system derived from Cell_type:
table(allSRAFinal$finalSystem, allSRAFinal$Cell_typeSystem) %>%
  melt(.) %>%
  drawAnnotationHeatmap("System annotation", "Cell_type", "Final")
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# Final system vs the system derived from source_name:
table(allSRAFinal$finalSystem, allSRAFinal$source_nameSystem) %>%
  melt(.) %>%
  drawAnnotationHeatmap("System annotation", "source_name", "Final")
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# And now same for organs:
table(allSRAFinal$finalOrgan, allSRAFinal$Cell_typeOrgan) %>%
  melt(.) %>%
  drawAnnotationHeatmap("Organ annotation", "Cell_type", "Final")
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

table(allSRAFinal$finalOrgan, allSRAFinal$source_nameOrgan) %>%
  melt(.) %>%
  drawAnnotationHeatmap("Organ annotation", "source_name", "Final")
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

# And the final table... this should be messy, but there should be some sanity to it.
table(allSRAFinal$finalOrgan, allSRAFinal$finalSystem) %>%
  melt(.) %>%
  drawAnnotationHeatmap("Master annotation", "System", "Organ")
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

I think it all looks mostly good now, with the open question of what to do with those where there’s a mismatch between annotation derived from either Cell_type or source_name and the other columns. It’s not going to make a big difference, I guess, so I think we’re ready to move on to real analyses… let’s just look at things again tho:

# Size of the table at this point (rows, columns).
dim(allSRAFinal)
## [1] 24028    48
# Rebuild the four mismatch sets, one per pair of annotation columns.
# NOTE(review): seeMismatches is defined earlier in the file; presumably it returns the rows of
# allSRAFinal where the two named columns disagree - confirm against its definition.
cellOrganMismatches <- seeMismatches("finalOrgan", "Cell_typeOrgan")
cellSystemMismatches <- seeMismatches("finalSystem", "Cell_typeSystem")
sourceOrganMismatches <- seeMismatches("finalOrgan", "source_nameOrgan")
sourceSystemMismatches <- seeMismatches("finalSystem", "source_nameSystem")

# For each mismatch set: rows are the final assignment, columns the per-column annotation.
table(cellOrganMismatches[["finalOrgan"]], cellOrganMismatches[["Cell_typeOrgan"]])
##               
##                blood cancer heart uterus
##   blood vessel     0      0     1      0
##   bone marrow     31      0     0      0
##   lung            66      0     0      0
##   ovary            0     17     0      1
table(cellSystemMismatches[["finalSystem"]], cellSystemMismatches[["Cell_typeSystem"]])
##                    
##                     cancer connective tissue immune
##   connective tissue     10                 0      0
##   hematopoetic           0                 0     31
##   integument             0                24      0
##   reproductive          17                 0      0
##   respiratory            0                 0     66
table(sourceOrganMismatches[["finalOrgan"]], sourceOrganMismatches[["source_nameOrgan"]])
##           
##            cancer lung lymph node
##   blood         0    8         66
##   lung          4    0          0
##   ovary         9    0          0
##   prostate     16    0          0
##   uterus        4    0          0
table(sourceSystemMismatches[["finalSystem"]], sourceSystemMismatches[["source_nameSystem"]])
##               
##                cancer connective tissue lymphatic respiratory
##   immune            0                 0        66           8
##   reproductive     29               129         0           0
##   respiratory       4                 0         0           0
# And the studies each mismatch set comes from:
table(cellOrganMismatches[["SRA.Study"]])
## 
## DRP001797 SRP221289 SRP221290 SRP223239 SRP310133 SRP328303 
##         1         8        10        66        20        11
table(cellSystemMismatches[["SRA.Study"]])
## 
## SRP070663 SRP219483 SRP220383 SRP221289 SRP221290 SRP223239 SRP310133 SRP328303 
##         4        10        10         7        20        66        20        11
table(sourceOrganMismatches[["SRA.Study"]])
## 
## SRP118614 SRP154973 SRP223239 SRP290921 SRP352989 
##        16         4        66        13         8
table(sourceSystemMismatches[["SRA.Study"]])
## 
## SRP118614 SRP154973 SRP223239 SRP225709 SRP290921 SRP352989 
##        16         4        66       129        13         8
# Yep, not too fussed. So now we can remove some of the temp columns and write this out
# The temp columns are dropped by position (37 to 46), so first make sure the table still has
# the 48-column layout reported by dim() above; if the layout had shifted, the positional drop
# would silently remove the wrong columns.
stopifnot(ncol(allSRAFinal) == 48L)
allSRAFinal <- allSRAFinal[, -(37:46)]
saveRDS(allSRAFinal, "allSRAFinalTissues.rds")

And we’re done!